### Loading packages, reading data

In [1]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [2]:
data = pd.read_csv("/home/linus/Documents/Skola/financial_sentiment_analysis_project/Data/all-data.csv", encoding="ISO-8859-1", header = None)
data.columns = ["Sentiment", "Sentence"]

In [8]:
rel_freq = data.Sentiment.value_counts()/data.shape[0]

In [10]:
rel_freq

neutral     0.594098
positive    0.281263
negative    0.124639
Name: Sentiment, dtype: float64

### splitting data into 60/20/20 for both balanced and unbalanced data

In [4]:
train, test = train_test_split(data, test_size = 0.20, random_state = 100)


In [5]:
train, valid = train_test_split(train, test_size = 0.25, random_state = 100)


In [6]:
train

Unnamed: 0,Sentiment,Sentence
3379,neutral,"The total size of the complex is around 25,000..."
2974,neutral,"ISMS does not disclose its financial results ,..."
420,neutral,"Capman , the Finnish alternative asset manager..."
1015,neutral,Juha-Pekka Weckstr+¦m has been appointed Presi...
498,positive,Most of the increase in net sales was due to t...
...,...,...
2197,positive,"The 50-50 joint venture , to be called Nokia S..."
4800,negative,`` Operating profit declined mainly due to the...
740,positive,Repeats sees 2008 EBITA above 18 pct of sales .
1186,neutral,BasWare Order Matching automatically matches p...


In [7]:
valid

Unnamed: 0,Sentiment,Sentence
3600,neutral,"For 2009 , net profit was EUR3m and the compan..."
2308,positive,"The EBITDA was EUR116m compared to EUR115m , c..."
1178,neutral,"As a result of the merger , the largest profes..."
246,positive,Operating profit rose to EUR 13.5 mn from EUR ...
940,positive,"According to a report by Neomarkka , Kuitu Fin..."
...,...,...
3645,neutral,Inha Works has invested in its product develop...
3930,neutral,The unit 's clients are mainly in the field of...
748,positive,`` This is a win for all OEMs targeting to dev...
4620,neutral,- Among other Finnish shares traded in the US ...


In [8]:
test

Unnamed: 0,Sentiment,Sentence
1903,neutral,Kesko www.kesko.fi is a Finnish retail special...
1625,neutral,"The contracts awarded to date , in connection ..."
1567,positive,Nokia will continue to invest in future develo...
208,positive,"Ragutis , which is controlled by the Finnish b..."
1225,neutral,"Following the registration , the number of iss..."
...,...,...
3954,neutral,Tikkurila acquired a majority stake in Kolorit...
2937,neutral,Her work at NetApp included strategically repo...
709,positive,"Due to rapid expansion , the market share of T..."
4628,neutral,Sales in local currencies decreased by 0.5 per...


In [9]:
2907/4845

0.6

### Creating a balanced dataset using naive Random oversampling

Trying to create a balanced data set using random oversampling, data points of the minority classes will be sampled with replacement and appended to the original data. Using the library imblearn to solve this.

In [10]:
# Using code similar to the documentation example. 

random_sampler = RandomOverSampler(random_state= 100)
balanced_sentences, balanced_sentiment = random_sampler.fit_resample(train.Sentence.values.reshape(2907,1), train.Sentiment)

In [11]:
train_balanced = pd.DataFrame(
    {"Sentences" : balanced_sentences[:,0],
    "Sentiment" : balanced_sentiment.values
    }
)

In [15]:
train_balanced.Sentiment.value_counts()

neutral     1715
positive    1715
negative    1715
Name: Sentiment, dtype: int64

In [16]:
train_balanced

Unnamed: 0,Sentences,Sentiment
0,"The total size of the complex is around 25,000...",neutral
1,"ISMS does not disclose its financial results ,...",neutral
2,"Capman , the Finnish alternative asset manager...",neutral
3,Juha-Pekka Weckstr+¦m has been appointed Presi...,neutral
4,Most of the increase in net sales was due to t...,positive
...,...,...
5140,Satama 's net sales would be higher than the y...,positive
5141,Growth was strongest in F-Secure 's operator I...,positive
5142,"Operating profit totaled EUR 825mn , up from E...",positive
5143,The money will be spent `` on strengthening th...,positive


### Writing to csv files

In [13]:
train.to_csv("/home/linus/Documents/Skola/financial_sentiment_analysis_project/Data/train.csv", index=False)
valid.to_csv("/home/linus/Documents/Skola/financial_sentiment_analysis_project/Data/valid.csv", index=False)
test.to_csv("/home/linus/Documents/Skola/financial_sentiment_analysis_project/Data/test.csv", index=False)

In [14]:
train_balanced.to_csv("/home/linus/Documents/Skola/financial_sentiment_analysis_project/Data/train_balanced.csv", index=False)

# Links used

https://imbalanced-learn.org/stable/over_sampling.html#naive-random-over-sampling