# Data Cleanup

In [62]:
import pandas as pd
import re
import datetime
# Use nltk for tokenizer and stopwords removal
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [63]:
df = pd.read_csv('data/surfline_news_scrape.csv')

In [72]:
df = df[['date', 'title', 'tags_text', 'p_text']]
df

Unnamed: 0,date,title,tags_text,p_text
0,[],Russo Cam: A watery look into the North Shore’...,"[""CBD"",""Daniel Russo"",""InnerG"",""pipeline""]","[""Quickly access the spots you care about most..."
1,"Nov 2nd, 2020","Watch: Meet Kehu Butler, the 20-Year-Old Risin...",[],"[""Quickly access the spots you care about most..."
2,"Oct 17th, 2020","With No 2020 Olympics, How're the Surfers Feel...","[""ISA"",""Jordy Smith"",""Olympics"",""sally fitzgib...","[""Quickly access the spots you care about most..."
3,"Oct 13th, 2020",Watch Tom Carroll & Matt Grainger Break Down T...,[],"[""Quickly access the spots you care about most..."
4,"Jan 18th, 2021",A Powerful South and SSE Groundswell Is About ...,[],"[""Quickly access the spots you care about most..."
...,...,...,...,...
80,"Jan 15th, 2021","Pre-Game Swell Hits Hawaii; ""Preheats Oven"" fo...","[""swell stories""]","[""Quickly access the spots you care about most..."
81,"Jan 15th, 2021",How Will Jaws Stack Up? Comparing Pe’ahi‘s Gre...,"[""Jaws""]","[""Quickly access the spots you care about most..."
82,"Jan 17th, 2021",Replay: Live From Jaws and Waimea on Super Swe...,"[""Jaws"",""surfline live""]","[""Quickly access the spots you care about most..."
83,"Jan 18th, 2021",R.I.P. Ben Aipa (1942-2021),"[""Ben Aipa"",""Hawaii"",""r.i.p."",""shaper""]","[""Quickly access the spots you care about most..."


# Cleanup the data field

In [5]:
df['date']

0                 []
1      Nov 2nd, 2020
2     Oct 17th, 2020
3     Oct 13th, 2020
4     Jan 18th, 2021
           ...      
80    Jan 15th, 2021
81    Jan 15th, 2021
82    Jan 17th, 2021
83    Jan 18th, 2021
84    Jan 15th, 2021
Name: date, Length: 85, dtype: object

In [6]:
test_date = df['date'][1]
print('Datetime string example:')
print(df['date'][1])

Datetime string example:
Nov 2nd, 2020


In [7]:
date_array = df['date']
date_array

0                 []
1      Nov 2nd, 2020
2     Oct 17th, 2020
3     Oct 13th, 2020
4     Jan 18th, 2021
           ...      
80    Jan 15th, 2021
81    Jan 15th, 2021
82    Jan 17th, 2021
83    Jan 18th, 2021
84    Jan 15th, 2021
Name: date, Length: 85, dtype: object

In [8]:
from dateutil.parser import parse

dates = []
for item in date_array:
    try:
        date = str(parse(item))
        date2 = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        date3 = date2.strftime('%Y-%m-%d')
        dates.append(date3)
        # print(date2.date())
    except:
        dates.append('n/a')

article_date = {'article_date': dates}

In [9]:
len(dates)

85

In [10]:
df['dates'] = dates

In [11]:
df = df[['dates', 'title', 'p_text', 'tags_text']]
df

Unnamed: 0,dates,title,p_text,tags_text
0,,Russo Cam: A watery look into the North Shore’...,"[""Quickly access the spots you care about most...","[""CBD"",""Daniel Russo"",""InnerG"",""pipeline""]"
1,2020-11-02,"Watch: Meet Kehu Butler, the 20-Year-Old Risin...","[""Quickly access the spots you care about most...",[]
2,2020-10-17,"With No 2020 Olympics, How're the Surfers Feel...","[""Quickly access the spots you care about most...","[""ISA"",""Jordy Smith"",""Olympics"",""sally fitzgib..."
3,2020-10-13,Watch Tom Carroll & Matt Grainger Break Down T...,"[""Quickly access the spots you care about most...",[]
4,2021-01-18,A Powerful South and SSE Groundswell Is About ...,"[""Quickly access the spots you care about most...",[]
...,...,...,...,...
80,2021-01-15,"Pre-Game Swell Hits Hawaii; ""Preheats Oven"" fo...","[""Quickly access the spots you care about most...","[""swell stories""]"
81,2021-01-15,How Will Jaws Stack Up? Comparing Pe’ahi‘s Gre...,"[""Quickly access the spots you care about most...","[""Jaws""]"
82,2021-01-17,Replay: Live From Jaws and Waimea on Super Swe...,"[""Quickly access the spots you care about most...","[""Jaws"",""surfline live""]"
83,2021-01-18,R.I.P. Ben Aipa (1942-2021),"[""Quickly access the spots you care about most...","[""Ben Aipa"",""Hawaii"",""r.i.p."",""shaper""]"


# Clean News Article Copy

In [12]:
text_list = []
for text1 in df['p_text']:
    # text_list = []
    string = "Quickly access the spots you care about most."
    new_str = text1.replace(string, '')
    stop_words = set(stopwords.words('english'))
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    word_tokens = tokenizer.tokenize(new_str)
    filtered_string = [w for w in word_tokens if not w in stop_words]
    # print(new_words)
    text_list.append(filtered_string)

In [13]:
title_list = []
for text2 in df['title']:
    stop_words = set(stopwords.words('english'))
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    word_tokens = tokenizer.tokenize(text2)
    filtered_string = [w for w in word_tokens if not w in stop_words]
    title_list.append(filtered_string)

In [71]:
tags_list = []
tags_text = df['tags_text']
type(tags_text[0])

str

In [14]:
# df.drop(df.tail(1).index, inplace=True)

In [15]:
df['tokenized_copy'] = text_list

In [16]:
df['tokenized_title'] = title_list

In [17]:
df2 = df[['dates', 'tokenized_title', 'tokenized_copy', 'tags_text']]
df2.shape

(85, 4)

In [18]:
df2.head()

Unnamed: 0,dates,tokenized_title,tokenized_copy,tags_text
0,,"[Russo, Cam, A, watery, look, North, Shore, se...","[On, clear, winter, day, balmy, coastline, Oah...","[""CBD"",""Daniel Russo"",""InnerG"",""pipeline""]"
1,2020-11-02,"[Watch, Meet, Kehu, Butler, 20, Year, Old, Ris...","[If, know, Kehu, Butler, yet, better, way, get...",[]
2,2020-10-17,"[With, No, 2020, Olympics, How, Surfers, Feeling]","[Two, historic, things, Surfing, inaugural, de...","[""ISA"",""Jordy Smith"",""Olympics"",""sally fitzgib..."
3,2020-10-13,"[Watch, Tom, Carroll, Matt, Grainger, Break, D...","[We, got, kinds, reef, around, play, When, two...",[]
4,2021-01-18,"[A, Powerful, South, SSE, Groundswell, Is, Abo...","[The, month, January, exactly, renowned, big, ...",[]


In [40]:
def create_string(tokens):
    string = ''
    for token in tokens:
        string += token + ' ' 
    return string

titles_text_lists = df2['tokenized_title']
titles_string = [create_string(token_list) for token_list in titles_text_lists]
len(titles_string)
# titles_string[0]


# create_string(text2)
# text2 = df2['tokenized_title'][0]
# text1 = ''
# # text2 = text2.replace(''', '')
# for text in text2:
#     text1 += text + ' '
# text1

85

In [43]:
copy_text_lists = df2['tokenized_copy']
copy_string = [create_string(token_list) for token_list in copy_text_lists]
copy_string[0]

'On clear winter day balmy coastline Oahu North Shore Daniel Russo floats serenely The sea agitated churning sucking toward horizon Banzai Pipeline crowd scratches toward incoming west set capped slightly Second Reef standing tall slab like brave takers willing turn First Reef behemoth But underground heavies jockey visiting CT pros brazen youth trying make name Daniel holds position treading water one arm clutching heavy water housing bizarre portrait calm feet impact zone world deadliest wave Of course Daniel first rodeo He earned stripes lucky us get intimate look world Inner G newest episode Russo Cam access pass North Shore shot North Shore finest water lensman Daniel Russo Just growing North Shore backyard powerful playground describes Russo And I think drew lifestyle It attraction energy excitement I could see happening beach water I thought pretty cool water photography like secret society artists earn way Certainly D Russo Now press play join journey Because good body good psy

In [52]:
tags_text_list = df2['tags_text'][0]
text5 = re.sub(r'\[.*?\]+', '', tags_text_list)
tags_text_list.replace('[', '')

'"CBD","Daniel Russo","InnerG","pipeline"]'

In [60]:
def clean_str(string):
    string = string.replace('[', '')
    string = string.replace(']', '')
    string = string.replace('"', '')
    string = string.replace(',', ' ')
    return string

clean_str(tags_text_list)

tags_text_list = df2['tags_text']
tags_string = [clean_str(token_list) for token_list in tags_text_list]

print(len(tags_string))
print(tags_string[0])

85
CBD Daniel Russo InnerG pipeline


In [66]:
surfline_df = df2
surfline_df['string_titles'] = titles_string
surfline_df['string_copy'] = copy_string
surfline_df['string_tags'] = tags_string
surfline_df

Unnamed: 0,dates,tokenized_title,tokenized_copy,tags_text,string_titles,string_copy,string_tags
0,,"[Russo, Cam, A, watery, look, North, Shore, se...","[On, clear, winter, day, balmy, coastline, Oah...","[""CBD"",""Daniel Russo"",""InnerG"",""pipeline""]",Russo Cam A watery look North Shore secret soc...,On clear winter day balmy coastline Oahu North...,CBD Daniel Russo InnerG pipeline
1,2020-11-02,"[Watch, Meet, Kehu, Butler, 20, Year, Old, Ris...","[If, know, Kehu, Butler, yet, better, way, get...",[],Watch Meet Kehu Butler 20 Year Old Rising Surf...,If know Kehu Butler yet better way get acquain...,
2,2020-10-17,"[With, No, 2020, Olympics, How, Surfers, Feeling]","[Two, historic, things, Surfing, inaugural, de...","[""ISA"",""Jordy Smith"",""Olympics"",""sally fitzgib...",With No 2020 Olympics How Surfers Feeling,Two historic things Surfing inaugural debut Ol...,ISA Jordy Smith Olympics sally fitzgibbons
3,2020-10-13,"[Watch, Tom, Carroll, Matt, Grainger, Break, D...","[We, got, kinds, reef, around, play, When, two...",[],Watch Tom Carroll Matt Grainger Break Down Tow...,We got kinds reef around play When two time Wo...,
4,2021-01-18,"[A, Powerful, South, SSE, Groundswell, Is, Abo...","[The, month, January, exactly, renowned, big, ...",[],A Powerful South SSE Groundswell Is About To R...,The month January exactly renowned big south s...,
...,...,...,...,...,...,...,...
80,2021-01-15,"[Pre, Game, Swell, Hits, Hawaii, Preheats, Ove...","[Editor, Note, Stay, tuned, right, realtime, c...","[""swell stories""]",Pre Game Swell Hits Hawaii Preheats Oven Satur...,Editor Note Stay tuned right realtime coverage...,swell stories
81,2021-01-15,"[How, Will, Jaws, Stack, Up, Comparing, Pe, ah...","[Editor, Note, Stay, tuned, right, realtime, c...","[""Jaws""]",How Will Jaws Stack Up Comparing Pe ahi Greate...,Editor Note Stay tuned right realtime coverage...,Jaws
82,2021-01-17,"[Replay, Live, From, Jaws, Waimea, Super, Swel...","[Saturday, afternoon, bombing, northwest, swel...","[""Jaws"",""surfline live""]",Replay Live From Jaws Waimea Super Swell Satur...,Saturday afternoon bombing northwest swell pea...,Jaws surfline live
83,2021-01-18,"[R, I, P, Ben, Aipa, 1942, 2021]","[When, Ben, Aipa, big, heart, finally, stopped...","[""Ben Aipa"",""Hawaii"",""r.i.p."",""shaper""]",R I P Ben Aipa 1942 2021,When Ben Aipa big heart finally stopped beatin...,Ben Aipa Hawaii r.i.p. shaper


In [83]:
article_word_count = [len(item) for item in surfline_df['tokenized_copy']]
len(testlist)

85

In [84]:
surfline_df['word_count'] = article_word_count
surfline_df

Unnamed: 0,dates,tokenized_title,tokenized_copy,tags_text,string_titles,string_copy,string_tags,word_count
0,,"[Russo, Cam, A, watery, look, North, Shore, se...","[On, clear, winter, day, balmy, coastline, Oah...","[""CBD"",""Daniel Russo"",""InnerG"",""pipeline""]",Russo Cam A watery look North Shore secret soc...,On clear winter day balmy coastline Oahu North...,CBD Daniel Russo InnerG pipeline,197
1,2020-11-02,"[Watch, Meet, Kehu, Butler, 20, Year, Old, Ris...","[If, know, Kehu, Butler, yet, better, way, get...",[],Watch Meet Kehu Butler 20 Year Old Rising Surf...,If know Kehu Butler yet better way get acquain...,,105
2,2020-10-17,"[With, No, 2020, Olympics, How, Surfers, Feeling]","[Two, historic, things, Surfing, inaugural, de...","[""ISA"",""Jordy Smith"",""Olympics"",""sally fitzgib...",With No 2020 Olympics How Surfers Feeling,Two historic things Surfing inaugural debut Ol...,ISA Jordy Smith Olympics sally fitzgibbons,280
3,2020-10-13,"[Watch, Tom, Carroll, Matt, Grainger, Break, D...","[We, got, kinds, reef, around, play, When, two...",[],Watch Tom Carroll Matt Grainger Break Down Tow...,We got kinds reef around play When two time Wo...,,148
4,2021-01-18,"[A, Powerful, South, SSE, Groundswell, Is, Abo...","[The, month, January, exactly, renowned, big, ...",[],A Powerful South SSE Groundswell Is About To R...,The month January exactly renowned big south s...,,514
...,...,...,...,...,...,...,...,...
80,2021-01-15,"[Pre, Game, Swell, Hits, Hawaii, Preheats, Ove...","[Editor, Note, Stay, tuned, right, realtime, c...","[""swell stories""]",Pre Game Swell Hits Hawaii Preheats Oven Satur...,Editor Note Stay tuned right realtime coverage...,swell stories,426
81,2021-01-15,"[How, Will, Jaws, Stack, Up, Comparing, Pe, ah...","[Editor, Note, Stay, tuned, right, realtime, c...","[""Jaws""]",How Will Jaws Stack Up Comparing Pe ahi Greate...,Editor Note Stay tuned right realtime coverage...,Jaws,598
82,2021-01-17,"[Replay, Live, From, Jaws, Waimea, Super, Swel...","[Saturday, afternoon, bombing, northwest, swel...","[""Jaws"",""surfline live""]",Replay Live From Jaws Waimea Super Swell Satur...,Saturday afternoon bombing northwest swell pea...,Jaws surfline live,74
83,2021-01-18,"[R, I, P, Ben, Aipa, 1942, 2021]","[When, Ben, Aipa, big, heart, finally, stopped...","[""Ben Aipa"",""Hawaii"",""r.i.p."",""shaper""]",R I P Ben Aipa 1942 2021,When Ben Aipa big heart finally stopped beatin...,Ben Aipa Hawaii r.i.p. shaper,645


In [86]:
surfline_df.to_csv('data/surfline_clean_data.csv', index=False)