In [1]:
import sys

# make the project's local helper scripts importable from this notebook
# (the path is relative, so it resolves against the kernel's working directory)
SCRIPTS_DIR = '../../scripts/'

# only prepend once so re-running this cell does not pile up duplicate entries
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

In [2]:
# import required packages
import pandas as pd

# preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

# local scripts
from text_utils import preprocess_corpus

In [3]:
# load dataset into dataframe
df = pd.read_csv('../../data/train_data.csv')

# dataset shape: (rows, columns)
display(df.shape)

# 5 random datapoints
# fixed random_state so the same rows are shown on every Restart & Run All
df.sample(5, random_state=42)

(3192, 2)

Unnamed: 0,Sentence,Sentiment
2736,$FB & $TSLA cracking lower early. $short #corr...,negative
3190,The announcement comes two weeks before a key ...,negative
2262,Aldi and Lidl expansion plans speed ahead as T...,positive
510,EUR 220 million of the transaction considerati...,neutral
582,Johnson Matthey revs up on clean air drive,positive


In [4]:
# clean the raw sentences before vectorization
# preprocess_corpus comes from the local text_utils script; per the original
# notes it strips special characters and stopwords and lemmatizes the text
clean_sentence = preprocess_corpus(df['Sentence'])

# peek at the first few cleaned documents
clean_sentence.head()

0    upm kymmene one world lead print paper produce...
1        nokia pct eur kick morning negative territory
2    vasantha appoint manage director incap contrac...
3    consolidated net sale increase reach eur opera...
4    cabot export production mainly goodyear bridge...
Name: Sentence, dtype: object

In [5]:
# set up the TF-IDF vectorizer
# min_df=4    -> ignore terms found in fewer than 4 documents
# max_df=0.01 -> ignore terms found in more than 1% of the documents
vectorizer = TfidfVectorizer(
    min_df=4,
    max_df=0.01,
)

# learn the vocabulary / IDF weights from clean_sentence and vectorize it in one pass
vectors = vectorizer.fit_transform(clean_sentence)

In [6]:
# extract tfidf vectors as dataframe
# reuse the row labels of clean_sentence so every TF-IDF row keeps the label of
# the sentence it was computed from (a default RangeIndex would silently
# misalign rows if preprocessing ever drops or reorders documents)
df_tfidf = pd.DataFrame(
    vectors.toarray(),
    index=clean_sentence.index,
    columns=vectorizer.get_feature_names_out(),
)

# add Sentiment column to TF-IDF vector data
# the assignment aligns on index labels
# assumes preprocess_corpus preserves df's index labels -- TODO confirm
df_tfidf['Sentiment'] = df['Sentiment']

# shape
display(df_tfidf.shape)

# first 5 datapoints
df_tfidf.head()

(3192, 1355)

Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accumulate,acerta,across,...,write,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone,Sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral
