# Data Cleanup

In [242]:
import pandas as pd
import re
import datetime
# Use nltk for tokenizer and stopwords removal
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [243]:
df = pd.read_csv('data/surfline_news_scrape.csv')

In [244]:
df = df[['date', 'title', 'tags_text', 'p_text']]
df.shape

(85, 4)

# Cleanup the data field

In [245]:
df['date']

0                 []
1      Nov 2nd, 2020
2     Oct 17th, 2020
3     Oct 13th, 2020
4     Jan 18th, 2021
           ...      
80    Jan 15th, 2021
81    Jan 15th, 2021
82    Jan 17th, 2021
83    Jan 18th, 2021
84    Jan 15th, 2021
Name: date, Length: 85, dtype: object

In [246]:
test_date = df['date'][1]
print('Datetime string example:')
print(df['date'][1])

Datetime string example:
Nov 2nd, 2020


In [247]:
date_array = df['date']
date_array

0                 []
1      Nov 2nd, 2020
2     Oct 17th, 2020
3     Oct 13th, 2020
4     Jan 18th, 2021
           ...      
80    Jan 15th, 2021
81    Jan 15th, 2021
82    Jan 17th, 2021
83    Jan 18th, 2021
84    Jan 15th, 2021
Name: date, Length: 85, dtype: object

In [248]:
from dateutil.parser import parse

dates = []
for item in date_array:
    try:
        date = str(parse(item))
        date2 = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        date3 = date2.strftime('%Y-%m-%d')
        dates.append(date3)
        # print(date2.date())
    except:
        dates.append('n/a')

article_date = {'article_date': dates}

In [249]:
len(dates)

85

In [250]:
df['dates'] = dates

In [251]:
df = df[['dates', 'title', 'p_text', 'tags_text']]
df

Unnamed: 0,dates,title,p_text,tags_text
0,,Russo Cam: A watery look into the North Shore’...,"[""Quickly access the spots you care about most...","[""CBD"",""Daniel Russo"",""InnerG"",""pipeline""]"
1,2020-11-02,"Watch: Meet Kehu Butler, the 20-Year-Old Risin...","[""Quickly access the spots you care about most...",[]
2,2020-10-17,"With No 2020 Olympics, How're the Surfers Feel...","[""Quickly access the spots you care about most...","[""ISA"",""Jordy Smith"",""Olympics"",""sally fitzgib..."
3,2020-10-13,Watch Tom Carroll & Matt Grainger Break Down T...,"[""Quickly access the spots you care about most...",[]
4,2021-01-18,A Powerful South and SSE Groundswell Is About ...,"[""Quickly access the spots you care about most...",[]
...,...,...,...,...
80,2021-01-15,"Pre-Game Swell Hits Hawaii; ""Preheats Oven"" fo...","[""Quickly access the spots you care about most...","[""swell stories""]"
81,2021-01-15,How Will Jaws Stack Up? Comparing Pe’ahi‘s Gre...,"[""Quickly access the spots you care about most...","[""Jaws""]"
82,2021-01-17,Replay: Live From Jaws and Waimea on Super Swe...,"[""Quickly access the spots you care about most...","[""Jaws"",""surfline live""]"
83,2021-01-18,R.I.P. Ben Aipa (1942-2021),"[""Quickly access the spots you care about most...","[""Ben Aipa"",""Hawaii"",""r.i.p."",""shaper""]"


# Clean News Article Copy

In [252]:
text_list = []
for text1 in df['p_text']:
    # text_list = []
    string = "Quickly access the spots you care about most."
    new_str = text1.replace(string, '')
    stop_words = set(stopwords.words('english'))
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    word_tokens = tokenizer.tokenize(new_str)
    filtered_string = [w for w in word_tokens if not w in stop_words]
    # print(new_words)
    text_list.append(filtered_string)

In [274]:
title_list = []
for text2 in df['title']:
    stop_words = set(stopwords.words('english'))
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    word_tokens = tokenizer.tokenize(text2)
    filtered_string = [w for w in word_tokens if not w in stop_words]
    title_list.append(filtered_string)

In [214]:
# df.drop(df.tail(1).index, inplace=True)

In [254]:
df['tokenized_copy'] = text_list

In [276]:
df['tokenized_title'] = title_list

In [280]:
df2 = df[['dates', 'tokenized_title', 'tokenized_copy', 'tags_text']]
df2.shape

(85, 4)

In [281]:
df2.head()

Unnamed: 0,dates,tokenized_title,tokenized_copy,tags_text
0,,"[Russo, Cam, A, watery, look, North, Shore, se...","[On, clear, winter, day, balmy, coastline, Oah...","[""CBD"",""Daniel Russo"",""InnerG"",""pipeline""]"
1,2020-11-02,"[Watch, Meet, Kehu, Butler, 20, Year, Old, Ris...","[If, know, Kehu, Butler, yet, better, way, get...",[]
2,2020-10-17,"[With, No, 2020, Olympics, How, Surfers, Feeling]","[Two, historic, things, Surfing, inaugural, de...","[""ISA"",""Jordy Smith"",""Olympics"",""sally fitzgib..."
3,2020-10-13,"[Watch, Tom, Carroll, Matt, Grainger, Break, D...","[We, got, kinds, reef, around, play, When, two...",[]
4,2021-01-18,"[A, Powerful, South, SSE, Groundswell, Is, Abo...","[The, month, January, exactly, renowned, big, ...",[]
