In [16]:
import pandas as pd
from datetime import datetime
import chardet

# This file is not encoded as UTF-8, so a default read_csv raises a decode error.
# Detect the encoding from the raw bytes and use it for the actual read.
with open('./sentiment_analysis_data.csv', 'rb') as file:
    result = chardet.detect(file.read())
    print(result)
    # chardet reports None when it cannot guess; fall back to the encoding
    # this dataset was previously read with.
    encoding = result['encoding'] or 'Windows-1252'

# The CSV has no header row: its first record is data. Name the columns
# explicitly instead of letting pandas consume that record as column labels
# and re-inserting it afterwards.
headline_sentiment_data = pd.read_csv(
    './sentiment_analysis_data.csv',
    encoding=encoding,
    header=None,
    names=['sentiment', 'headline'],
)

print(headline_sentiment_data)
print(headline_sentiment_data.columns)


{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
     sentiment                                           headline
0      neutral  According to Gran , the company has no plans t...
1      neutral  Technopolis plans to develop in stages an area...
2     negative  The international electronic industry company ...
3     positive  With the new production plant the company woul...
4     positive  According to the company 's updated strategy f...
...        ...                                                ...
4841  negative  LONDON MarketWatch -- Share prices ended lower...
4842   neutral  Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843  negative  Operating profit fell to EUR 35.4 mn from EUR ...
4844  negative  Net sales of the Paper segment decreased to EU...
4845  negative  Sales in Finland decreased by 10.5 % in Januar...

[4846 rows x 2 columns]
Index(['sentiment', 'headline'], dtype='object')


### Minimal Data Cleaning

In [18]:
import re

def clean_text(text):
    """Strip markup, URLs and stray symbols from a headline.

    Steps, in order: remove ``<...>`` tags, remove ``http...`` URLs, drop every
    character that is not a word character, whitespace, ``.`` or ``'``,
    re-attach a detached possessive (``company 's`` -> ``company's``), then
    trim leading and trailing whitespace.

    Args:
        text: The headline string to clean.

    Returns:
        The cleaned string. Whitespace inside the text is not collapsed, so
        removed symbols can leave consecutive spaces behind.
    """
    text = re.sub(r'<.*?>', '', text)  # Remove HTML
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Keep . and ' because stripping them altered the meaning of some text,
    # e.g. "Rinkuskiai 's beer sales fell by 6.5 per cent" would become
    # "rinkuskiai s beer sales fell by 65 per cent".
    text = re.sub(r'[^\w\s\.\']', '', text)
    # Remove the space before a standalone 's. The trailing \b keeps quoted
    # words that merely start with "s" (e.g. 'super') from being glued to the
    # preceding word.
    text = re.sub(r'\s+\'s\b', "'s", text)
    return text.strip()

# Lower-case every headline, then run it through clean_text.
headline_sentiment_data["headline"] = headline_sentiment_data["headline"].map(
    lambda headline: clean_text(headline.lower())
)

print(headline_sentiment_data)

# Number of rows per sentiment label.
sentiment_counts = headline_sentiment_data["sentiment"].value_counts()
print(sentiment_counts)


     sentiment                                           headline
0      neutral  according to gran  the company has no plans to...
1      neutral  technopolis plans to develop in stages an area...
2     negative  the international electronic industry company ...
3     positive  with the new production plant the company woul...
4     positive  according to the company's updated strategy fo...
...        ...                                                ...
4841  negative  london marketwatch  share prices ended lower i...
4842   neutral  rinkuskiai's beer sales fell by 6.5 per cent t...
4843  negative  operating profit fell to eur 35.4 mn from eur ...
4844  negative  net sales of the paper segment decreased to eu...
4845  negative  sales in finland decreased by 10.5  in january...

[4846 rows x 2 columns]
sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64


### Drop Neutral Sentiment

In [24]:
# Collect the index labels of every row labelled "neutral" ...
neutral_list = headline_sentiment_data.index[
    headline_sentiment_data["sentiment"] == "neutral"
].tolist()
# ... drop those rows and renumber the remaining ones from zero.
headline_sentiment_data = (
    headline_sentiment_data
    .drop(index=neutral_list)
    .reset_index(drop=True)
)
# One-hot encode the label and discard the first indicator column; in the
# output shown below the remaining column is True for "positive" rows.
headline_sentiment_data["sentiment"] = pd.get_dummies(
    headline_sentiment_data["sentiment"], drop_first=True
)
print(headline_sentiment_data)

      sentiment                                           headline
0         False  the international electronic industry company ...
1          True  with the new production plant the company woul...
2          True  according to the company's updated strategy fo...
3          True  financing of aspocomp's growth aspocomp is agg...
4          True  for the last quarter of 2010  componenta's net...
...         ...                                                ...
1962      False  helsinki thomson financial  shares in cargotec...
1963      False  london marketwatch  share prices ended lower i...
1964      False  operating profit fell to eur 35.4 mn from eur ...
1965      False  net sales of the paper segment decreased to eu...
1966      False  sales in finland decreased by 10.5  in january...

[1967 rows x 2 columns]


### Convert Sentiment into Numerical Values

In [26]:
# Translate the boolean indicator into an integer label.
sentiment_mapping = {
    False: 0,
    True: 1,
}
numeric_sentiment = headline_sentiment_data["sentiment"].map(sentiment_mapping)
headline_sentiment_data["sentiment"] = numeric_sentiment

print(headline_sentiment_data)

      sentiment                                           headline
0             0  the international electronic industry company ...
1             1  with the new production plant the company woul...
2             1  according to the company's updated strategy fo...
3             1  financing of aspocomp's growth aspocomp is agg...
4             1  for the last quarter of 2010  componenta's net...
...         ...                                                ...
1962          0  helsinki thomson financial  shares in cargotec...
1963          0  london marketwatch  share prices ended lower i...
1964          0  operating profit fell to eur 35.4 mn from eur ...
1965          0  net sales of the paper segment decreased to eu...
1966          0  sales in finland decreased by 10.5  in january...

[1967 rows x 2 columns]


In [28]:
# Persist the cleaned frame; the row index is not written to the file.
headline_sentiment_data.to_csv(
    path_or_buf="sentiment_data_cleaned_up.csv",
    index=False,
)