In [3]:
# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Loading data
dfProcessed = pd.read_csv('IMDBReviewsCleanned.csv')
del dfProcessed['Unnamed: 0']
dfProcessed.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,petter matteis love in the time of money is a ...,positive


In [19]:
# Dividing data into matrix
X = dfProcessed['review'].copy()
y = dfProcessed['sentiment'].copy()
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.3, random_state=1234)

# OHE Embeddings

In [13]:
# Creating OHE object
OHEEmb = CountVectorizer(max_features=1000)
XOHE = OHEEmb.fit_transform(X)
XOHE

<50000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 4389879 stored elements in Compressed Sparse Row format>

In [18]:
# Printing dataframe
pd.DataFrame(XOHE.toarray(), columns=OHEEmb.get_feature_names_out())

Unnamed: 0,10,20,30,70s,80s,able,about,above,absolutely,across,...,yet,york,you,youll,young,your,youre,yourself,youve,zombie
0,0,0,0,0,0,0,1,0,0,0,...,0,0,2,1,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,2
4,0,0,0,0,0,0,2,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,1,0,0,0,0,1,0,0,0,...,0,0,2,0,0,1,0,3,0,0
49996,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
49997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,0,0,1,0,0,0,2,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [20]:
# Applying to train and test
XTrainOHE = OHEEmb.transform(XTrain)
XTestOHE = OHEEmb.transform(XTest)
XTrainOHE

<35000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 3075360 stored elements in Compressed Sparse Row format>

In [21]:
# Printing dataframe trainOHE
pd.DataFrame(XTrainOHE.toarray(), columns=OHEEmb.get_feature_names_out())

Unnamed: 0,10,20,30,70s,80s,able,about,above,absolutely,across,...,yet,york,you,youll,young,your,youre,yourself,youve,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34995,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,1,0,0,0,0,0
34996,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34997,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
34998,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


# tf-idf embeddings

In [22]:
# Creating tf object
tfidfEmb = TfidfVectorizer(max_features=1000)
XTF = tfidfEmb.fit_transform(X)
XTF

<50000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4389879 stored elements in Compressed Sparse Row format>

In [23]:
# Printing dataframe
pd.DataFrame(XTF.toarray(), columns=tfidfEmb.get_feature_names_out())

Unnamed: 0,10,20,30,70s,80s,able,about,above,absolutely,across,...,yet,york,you,youll,young,your,youre,yourself,youve,zombie
0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.035092,0.0,0.0,0.000000,...,0.0,0.000000,0.061796,0.076224,0.00000,0.051629,0.000000,0.000000,0.0,0.000000
1,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.053362,0.0,0.0,0.000000,...,0.0,0.000000,0.046984,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,0.000000
2,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.10469,0.000000,0.000000,0.000000,0.0,0.000000
3,0.101961,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.047362,0.000000,0.00000,0.000000,0.107727,0.000000,0.0,0.318851
4,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.075729,0.0,0.0,0.000000,...,0.0,0.095449,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.000000,0.108969,0.000000,0.0,0.0,0.0,0.044945,0.0,0.0,0.000000,...,0.0,0.000000,0.079147,0.000000,0.00000,0.066126,0.000000,0.311113,0.0,0.000000
49996,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.060886,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,0.000000
49997,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,0.000000
49998,0.000000,0.000000,0.133286,0.0,0.0,0.0,0.102823,0.0,0.0,0.118015,...,0.0,0.000000,0.045267,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,0.000000


In [24]:
# Applying to train and test
XTrainTF = tfidfEmb.transform(XTrain)
XTestTF = tfidfEmb.transform(XTest)
XTrainTF

<35000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 3075360 stored elements in Compressed Sparse Row format>

In [25]:
# Printing dataframe trainTF
pd.DataFrame(XTrainTF.toarray(), columns=tfidfEmb.get_feature_names_out())

Unnamed: 0,10,20,30,70s,80s,able,about,above,absolutely,across,...,yet,york,you,youll,young,your,youre,yourself,youve,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.083484,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.031642,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.057492,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34995,0.0,0.0,0.0,0.0,0.0,0.0,0.120312,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.104144,0.0,0.0,0.0,0.0,0.0
34996,0.0,0.0,0.0,0.0,0.0,0.0,0.047192,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
34997,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.092782,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
34998,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.046499,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [27]:
# Exporting dataframes
pd.DataFrame(XTrainOHE.toarray(), columns=OHEEmb.get_feature_names_out()).to_csv('XTrainOHE.csv')
pd.DataFrame(XTrainTF.toarray(), columns=tfidfEmb.get_feature_names_out()).to_csv('XTrainTF.csv')
pd.DataFrame(XTestOHE.toarray(), columns=OHEEmb.get_feature_names_out()).to_csv('XTestOHE.csv')
pd.DataFrame(XTestTF.toarray(), columns=tfidfEmb.get_feature_names_out()).to_csv('XTestTF.csv')
yTrain.to_csv('yTrain.csv')
yTest.to_csv('yTest.csv')