In [1]:
import sys

# Directory holding the project's helper scripts, relative to the notebook's
# working directory.
SCRIPTS_DIR = '../../scripts/'

# Put the scripts directory at the front of the import search path so local
# modules can be imported. The membership check keeps the cell idempotent:
# re-running it in a live kernel does not pile up duplicate entries.
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

In [2]:
# import required packages
import pandas as pd

# preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

# local scripts
from text_utils import preprocess_corpus

In [3]:
# Load the training data (sentences with their sentiment labels) into a dataframe.
df = pd.read_csv('../../data/train_data.csv')

# dataset shape: (rows, columns)
display(df.shape)

# Preview 5 random datapoints. The fixed seed makes the preview reproducible,
# so the displayed rows stay the same after Restart Kernel -> Run All.
df.sample(5, random_state=42)

(3192, 2)

Unnamed: 0,Sentence,Sentiment
2510,$SAVE breaking it's downtrend line on increasi...,positive
2257,Finnish OKO bank has signed a cooperation agre...,positive
1815,Mr Skogster currently serves as the manager re...,neutral
469,InterContinental Hotels first-quarter global r...,negative
87,Diluted earnings per share ( EPS ) declined to...,negative


In [4]:
# Clean the raw sentences with the project helper from text_utils.
# NOTE(review): per the original notes the helper removes special characters
# and stopwords and lemmatizes the text; its implementation is not visible
# here, so confirm against text_utils.preprocess_corpus.
clean_sentence = preprocess_corpus(df['Sentence'])

# peek at the first few cleaned documents
clean_sentence.head()

0    upm kymmene one world leading printing paper p...
1     nokia pct eur kicking morning negative territory
2    vasantha appointed managing director incap con...
3    consolidated net sale increased reach eur oper...
4    cabot export production mainly goodyear bridge...
Name: Sentence, dtype: object

In [5]:
# Create a TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights from the cleaned sentences and, in the
# same pass, turn each sentence into its TF-IDF vector.
vectors = vectorizer.fit_transform(raw_documents=clean_sentence)

In [6]:
# Expand the TF-IDF matrix into a dense dataframe with one column per
# vocabulary term. Rows of `vectors` follow the order of `clean_sentence`, so
# its index is carried over explicitly: every feature row keeps the identity
# of the sentence it was computed from instead of receiving a fresh 0..n-1
# index.
df_tfidf = pd.DataFrame(
    vectors.toarray(),
    index=clean_sentence.index,
    columns=vectorizer.get_feature_names_out(),
)

# Add the Sentiment column to the TF-IDF vector data. Column assignment aligns
# on index labels, so with the shared index above each label lands on its own
# sentence even if the source frame is ever filtered or reordered.
df_tfidf['Sentiment'] = df['Sentiment']

# shape: (rows, vocabulary size + 1 label column)
display(df_tfidf.shape)

# first 5 datapoints
df_tfidf.head()

(3192, 6683)

Unnamed: 0,aal,aaland,aalto,aapl,aaron,aava,abb,abbott,abbv,aberdeen,...,zain,zainalabedin,zanadvorov,zao,zinc,zloty,znga,zoltan,zone,Sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral
