In [1]:
# import required packages
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# load train_data.csv into a dataframe (relative path)
df = pd.read_csv('../../data/train_data.csv')

# dataset shape: (rows, columns)
display(df.shape)

# 10 random datapoints; random_state is pinned so a Restart & Run All
# shows the same preview rows every time
df.sample(10, random_state=42)

(3192, 2)

Unnamed: 0,clean_sentence,Sentiment
2367,outotec scope delivery cover engineering suppl...,neutral
1054,nokian tyre prof high safety excellently impor...,positive
1878,corrensponds percent okmetic share capital vot...,neutral
2456,london afx fortum said agreed sell industrial ...,neutral
2787,third quarter net sale increased eur operating...,positive
2874,pretax profit decreased eur eur fourth quarter,negative
1821,binkster lol looking correlation aria foreign ...,positive
2088,tieto offer aktia good foundation required sup...,positive
431,covered small mww short loss flat day cash,positive
2864,konecranes previously communicated estimated r...,negative


In [4]:
# how many clean_sentence entries are missing before the fix
display(df['clean_sentence'].isna().sum())

# substitute an empty string for every missing sentence
df['clean_sentence'] = df['clean_sentence'].fillna('')

# confirm that no missing entries remain
display(df['clean_sentence'].isna().sum())

1

0

In [5]:
# TF-IDF vectorizer created with its default settings
vectorizer = TfidfVectorizer()

# fit the vectorizer on the clean_sentence column and encode it in one pass
corpus = df['clean_sentence']
vectors = vectorizer.fit_transform(corpus)

In [6]:
# densify the TF-IDF matrix into a dataframe with one column per feature name;
# reuse df's index so each row keeps the same label as its source sentence
df_tfidf = pd.DataFrame(
    vectors.toarray(),
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)

# add Sentiment column to TF-IDF vector data
# (column assignment aligns on index labels, hence the shared index above)
df_tfidf['Sentiment'] = df['Sentiment']

# shape
display(df_tfidf.shape)

# first 5 datapoints
df_tfidf.head()

(3192, 6683)

Unnamed: 0,aal,aaland,aalto,aapl,aaron,aava,abb,abbott,abbv,aberdeen,...,zain,zainalabedin,zanadvorov,zao,zinc,zloty,znga,zoltan,zone,Sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral
