In [135]:
import pandas as pd
import chardet
import re

In [151]:
file = "./headline_finetune.csv"

# Guess the file's text encoding with chardet from the first 100000 bytes only,
# reading in binary mode so that no decoding is attempted beforehand.
with open(file, mode="rb") as rawdata:
    sample = rawdata.read(100000)
result = chardet.detect(sample)
result

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

In [157]:
# Column names are supplied explicitly; because no header row is requested,
# pandas treats the first line of the file as data.
ft_data = pd.read_csv(
    file,
    names=["sentiment", "headlines"],
    encoding="Windows-1252",
)
ft_data

Unnamed: 0,sentiment,headlines
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


## Preprocessing
### Text Cleaning

In [158]:
# Ordered (pattern, replacement) rules used by text_cleaner, compiled once.
# The order matters: each rule runs on the output of the previous one
# (e.g. apostrophes are rewritten by rule 1 before rule 4 strips them, and
# rule 5 glues "%" to its number before rule 9 looks for "% -").
# All patterns are raw strings so that regex escapes such as \s and \. are not
# interpreted (and warned about) as Python string escapes.
_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # (1) "'t", together with the letter directly before it, -> "not"
        #     (tokenised input such as "did n't" becomes "did not")
        (r"('t)|([a-zA-Z]'t)", 'not'),
        # (2) drop "www.x.y" URLs and "word+word" constructs
        (r"(www\.[a-zA-Z_0-9]+\.[a-zA-Z_]+)|([a-zA-Z]+\+\s*[a-zA-Z]*)", ' '),
        # (3) expand the stand-alone abbreviations "mn" / "m" to "million"
        (r"(\s+mn\s+)|(\s+m\s+)", ' million '),
        # (4) replace punctuation and special characters with a space
        (r'[\'\",&^#@)(;:]', ' '),
        # (5) normalise " %", " percent", " per cent", " pct" to "%" attached
        #     to the preceding token
        (r"(\s+\%)|(\s+percent)|(\s+per cent)|(\s+pct)", '%'),
        # (6) remove whitespace after a hyphen
        (r"\-\s+", '-'),
        # (7) replace a year (1900-2029) with the token "year"; the digit
        #     lookarounds keep 4-digit runs inside longer numbers (e.g. "12000")
        #     from being rewritten
        (r"(?<!\d)(?:19[0-9]{2}|20[0-2][0-9])(?!\d)", ' year '),
        # (8) remove a period that is preceded by whitespace, together with
        #     that whitespace
        #     NOTE(review): the original comment said only the whitespace is
        #     removed, but the period has always been dropped too; kept as is.
        (r"\s+\.", ''),
        # (9) "% -" between percentages becomes "% ~ "
        (r"\%\s+\-\s*", '% ~ '),
        # (10) normalise stand-alone "eur" / "euro" to " eur "
        (r"(\s+eur\s+)|(\s+euro\s+)", ' eur '),
        # (11) normalise stand-alone "usd" / "dollar" / "dollars" and any "$"
        #      to " $ "
        (r"(\s+usd\s+)|(\s+dollar\s+)|(\s+dollars\s+)|(\s*\$\s*)", ' $ '),
        # (12) collapse runs of spaces into a single space
        (r"[ ]+", ' '),
    )
)


def text_cleaner(text):
    """Normalise a headline by applying the cleaning rules in order.

    The keyword patterns ("mn", "eur", "usd", "percent", ...) are lower-case
    only, so the text should be lower-cased before it is passed in.

    Parameters
    ----------
    text : str
        The headline to clean.

    Returns
    -------
    str
        The cleaned headline. Leading or trailing single spaces produced by
        the substitutions are not stripped.
    """
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)
    return text

In [159]:
# Lower-case every headline first (the cleaning patterns are lower-case only),
# then run it through text_cleaner.
ft_data["headlines"] = ft_data["headlines"].apply(
    lambda headline: text_cleaner(headline.lower())
)

### Encoding Dichotomous Sentiments

In [160]:
# Drop the neutral headlines so that only two sentiment classes remain.
neutral_list = list(ft_data.loc[ft_data["sentiment"] == "neutral"].index)
ft_data = ft_data.drop(neutral_list, axis=0).reset_index(drop=True)
# One-hot encode and drop the first (alphabetically lowest) class, leaving a
# single indicator column. dtype=int keeps the labels as 0/1: pandas >= 2.0
# would otherwise return booleans and write True/False to the exported CSV.
ft_data.sentiment = pd.get_dummies(ft_data.sentiment, drop_first=True, dtype=int)

In [161]:
# Write the result next to the notebook (like the input CSV) instead of to a
# user-specific absolute path, so the notebook runs on any machine.
# index=True keeps the row index as the first, unnamed column of the file.
ft_data.to_csv("./ft_data1.csv", index=True, header=True)