# Data Cleaning

## Import Libraries and Load the data

In [115]:
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import re
from nltk.tokenize import RegexpTokenizer

plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
np.set_printoptions(precision=4)

In [72]:
df = pd.read_csv('resources/scrape1.csv')

In [101]:
df.head()

Unnamed: 0,date,user,title,comment,opinion,price,datetime
0,31 Jan 2022,JiffyBag,RE: SAS,"For me, it was just a fun exercise to do, and ...",No Opinion,13.8,2022-01-31
1,31 Jan 2022,JiffyBag,RE: SAS,"Hi Walkabout,I fully acknoweldege your rationa...",No Opinion,13.8,2022-01-31
2,31 Jan 2022,rosso123,Newcrest pulling a fast one,"NCM wants this 5% done and dusted asap, otherw...",No Opinion,13.8,2022-01-31
3,31 Jan 2022,JiffyBag,RE: Why would u short this,"Merc,It would be fantastic if someone could ac...",No Opinion,13.8,2022-01-31
4,31 Jan 2022,Philbrim,Ups n downs.,We go down 5% and most people are so volatile....,No Opinion,13.8,2022-01-31


## Data Cleaning

### Date Column

As the date column labels the current day (Today) and 5 previous days listed as the weekday only (the day of month is omitted), some pre-cleaning must be completed on csv scrapes that include the current day and 5 days prior to the current day, prior to concantenating to any other scrapes, to account for different scrape dates.

In [4]:
weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Today']

In [5]:
# view which days are in the format of Day, Time

for weekday in weekdays:
    print(weekday, len(df.loc[df['date'].str.contains(weekday)]))

Mon 0
Tue 0
Wed 309
Thu 282
Fri 352
Sat 96
Sun 146
Today 225


In [6]:
# to access the first date in the format of day, month, year, work out how many are in the other format

bad_format = 0
for weekday in weekdays:
    bad_format += len(df.loc[df['date'].str.contains(weekday)])

bad_format

1410

In [88]:
# view the first date with the correct format
# note when indexing with .loc, as the first line is 0 we dont need to add 1 to bad_format to view the proceding date

pd.to_datetime(df.loc[bad_format, 'date']).strftime('%d %b %Y')

'25 Jan 2022'

In [95]:
# calculate what day today is

today = pd.to_datetime(df.loc[bad_format, 'date']) + timedelta(6)
today.strftime('%d %b %Y')

'31 Jan 2022'

In [96]:
# work out what the other bad format days correspond to
today_m1 = today - timedelta(1)
today_m2 = today - timedelta(2)
today_m3 = today - timedelta(3)
today_m4 = today - timedelta(4)
today_m5 = today - timedelta(5)

In [97]:
# creat dict to map datetime.dayofweek to the bad format given in the csv
week_dict = {0:'Mon',
             1:'Tue',
             2:'Wed',
             3:'Thu',
             4:'Fri',
             5:'Sat',
             6:'Sun'}

In [99]:
# modify the dates with the bad format to align with the rest of the dataframe

df.loc[df['date'].str.contains('Today') ,'date'] = today.strftime('%d %b %Y')
df.loc[df['date'].str.contains(week_dict[today_m1.dayofweek]) ,'date'] = today_m1.strftime('%d %b %Y')
df.loc[df['date'].str.contains(week_dict[today_m2.dayofweek]) ,'date'] = today_m2.strftime('%d %b %Y')
df.loc[df['date'].str.contains(week_dict[today_m3.dayofweek]) ,'date'] = today_m3.strftime('%d %b %Y')
df.loc[df['date'].str.contains(week_dict[today_m4.dayofweek]) ,'date'] = today_m4.strftime('%d %b %Y')
df.loc[df['date'].str.contains(week_dict[today_m5.dayofweek]) ,'date'] = today_m5.strftime('%d %b %Y')

In [102]:
df.head()

Unnamed: 0,date,user,title,comment,opinion,price,datetime
0,31 Jan 2022,JiffyBag,RE: SAS,"For me, it was just a fun exercise to do, and ...",No Opinion,13.8,2022-01-31
1,31 Jan 2022,JiffyBag,RE: SAS,"Hi Walkabout,I fully acknoweldege your rationa...",No Opinion,13.8,2022-01-31
2,31 Jan 2022,rosso123,Newcrest pulling a fast one,"NCM wants this 5% done and dusted asap, otherw...",No Opinion,13.8,2022-01-31
3,31 Jan 2022,JiffyBag,RE: Why would u short this,"Merc,It would be fantastic if someone could ac...",No Opinion,13.8,2022-01-31
4,31 Jan 2022,Philbrim,Ups n downs.,We go down 5% and most people are so volatile....,No Opinion,13.8,2022-01-31


### comment field

In [109]:
df.comment[20]

"On average, they expect Greatland Gold's stock price to reach GBX 24.50 in the next twelve months. This suggests a possible upside of 73.8% from the stock's current price.73.8% ?You donâ\x80\x99t get that down Barclays or hsbc!!And a lot think thatâ\x80\x99s very conservative"

Prior to any NLP work it will be neccessary to remove any bad text encoding information from the comments field. At this stage we wont be looking to tokenize the data, rather to remove special characters and numbers

In [110]:
# see if there are any null rows in the data set
df[df.isnull().sum(axis=1)>0]

Unnamed: 0,date,user,title,comment,opinion,price,datetime


In [111]:
# this function will be used to process the comments into a clean comment column
# clean comment columns will be more workable for NLP

def text_clean(text):
    # string text
    st = str(text)
    
    # make lowercase
    lc = st.lower()
    
    # remove any web addresses
    rem_url = re.sub(r'http\S+', '', lc)
    
    # remove any special characters
    rem_char = re.sub(r'[^a-z ]+', '', rem_url)
    
    # remove whitespace
    rem_ws = re.sub('\s\s+', ' ', rem_char)
    
    # tokenize words
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_ws)
    
    # remove any words with 2 characters or less
    filtered_words = [w for w in tokens if len(w) > 2]
    
    # return tokens rejoined into a string
    return ' '.join(filtered_words)

In [116]:
df['comment_clean'] = df['comment'].map(lambda x: text_clean(x))

In [137]:
df.head()

Unnamed: 0,date,user,title,comment,opinion,price,datetime,comment_clean
0,31 Jan 2022,JiffyBag,RE: SAS,"For me, it was just a fun exercise to do, and ...",No Opinion,13.8,2022-01-31,for was just fun exercise and will have see wh...
1,31 Jan 2022,JiffyBag,RE: SAS,"Hi Walkabout,I fully acknoweldege your rationa...",No Opinion,13.8,2022-01-31,walkabouti fully acknoweldege your rationale e...
2,31 Jan 2022,rosso123,Newcrest pulling a fast one,"NCM wants this 5% done and dusted asap, otherw...",No Opinion,13.8,2022-01-31,ncm wants this done and dusted asap otherwise ...
3,31 Jan 2022,JiffyBag,RE: Why would u short this,"Merc,It would be fantastic if someone could ac...",No Opinion,13.8,2022-01-31,mercit would fantastic someone could actually ...
4,31 Jan 2022,Philbrim,Ups n downs.,We go down 5% and most people are so volatile....,No Opinion,13.8,2022-01-31,down and most people are volatile and everyone...


In [142]:
# see if there are any null rows in the data set
df[df.isnull().sum(axis=1)>0]

Unnamed: 0,date,user,title,comment,opinion,price,datetime,comment_clean


In [139]:
# drop any nul rows if neccessary
df.dropna(inplace=True)

## Export to csv

In [134]:
# save to csv
df.to_csv('resources/scrape1_clean.csv', index=False)

## NLP EDA

### Import NLP Packages

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.text import Text
import re
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix, precision_score, classification_report


tokenizer = RegexpTokenizer(r'\w+')