In [9]:
import pandas as pd
import numpy as np
from transformers import pipeline
import pickle
from functions_variables import *
from pandas import json_normalize

In [3]:
# Restore the pickled IMDB dataset from disk.
# NOTE: pickle.load can execute arbitrary code -- only use it on files from a trusted source.
with open('../data/imdb_dataset.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [4]:
# Build the training frame and derive the cleaned text columns in one chain.
# Each step feeds the next: raw text -> punctuation removed -> stop words
# removed -> truncated.
# NOTE(review): the original note says strings longer than 512 characters will
# not work, hence the truncation; the model's limit is presumably 512 tokens --
# TODO confirm what truncate_string actually enforces.
df_train = pd.DataFrame(data['train']).assign(
    text_no_punct=lambda frame: frame['text'].apply(remove_punctuation),
    text_no_stop=lambda frame: frame['text_no_punct'].apply(remove_stopwords_text),
    text_no_stop_trun=lambda frame: frame['text_no_stop'].apply(truncate_string),
)
df_train

Unnamed: 0,text,label,text_no_punct,text_no_stop,text_no_stop_trun
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,i rented i am curiousyellow from my video stor...,rented curiousyellow video store controversy s...,rented curiousyellow video store controversy s...
1,"""I Am Curious: Yellow"" is a risible and preten...",0,i am curious yellow is a risible and pretentio...,curious yellow risible pretentious steaming pi...,curious yellow risible pretentious steaming pi...
2,If only to avoid making this type of film in t...,0,if only to avoid making this type of film in t...,avoid making type film future film interesting...,avoid making type film future film interesting...
3,This film was probably inspired by Godard's Ma...,0,this film was probably inspired by godards mas...,film probably inspired godards masculin fémini...,film probably inspired godards masculin fémini...
4,"Oh, brother...after hearing about this ridicul...",0,oh brotherafter hearing about this ridiculous ...,oh brotherafter hearing ridiculous film umptee...,oh brotherafter hearing ridiculous film umptee...
...,...,...,...,...,...
24995,A hit at the time but now better categorised a...,1,a hit at the time but now better categorised a...,hit time better categorised australian cult fi...,hit time better categorised australian cult fi...
24996,I love this movie like no other. Another time ...,1,i love this movie like no other another time i...,love movie like another time try explain virtu...,love movie like another time try explain virtu...
24997,This film and it's sequel Barry Mckenzie holds...,1,this film and its sequel barry mckenzie holds ...,film sequel barry mckenzie holds two greatest ...,film sequel barry mckenzie holds two greatest ...
24998,'The Adventures Of Barry McKenzie' started lif...,1,the adventures of barry mckenzie started life ...,adventures barry mckenzie started life satiric...,adventures barry mckenzie started life satiric...


In [5]:
# Sentiment-analysis pipeline backed by a DistilBERT checkpoint fine-tuned on SST-2.
# Overview of the available pipeline tasks:
# https://huggingface.co/docs/transformers/main/en/task_summary
# device=0 places the model on the first GPU, so this cell assumes one is available.
classifier = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=0,
)

In [6]:
# Sanity-check the classifier on a single review before scoring the whole frame.
# The text is bound to its own name rather than `data`, so the DatasetDict
# loaded earlier is not overwritten and the cell that builds df_train from
# data['train'] can still be re-run afterwards.
sample_text = df_train.iloc[0]['text_no_stop']
preds = classifier(sample_text)
preds

[{'label': 'NEGATIVE', 'score': 0.9853844046592712}]

In [10]:
# Run the classifier over every truncated review and keep the raw prediction
# returned by get_sentiment in a new 'sentiment' column.
df_train['sentiment'] = df_train['text_no_stop_trun'].apply(
    lambda review: get_sentiment(classifier, review)
)

In [11]:
# Inspect the frame now that the 'sentiment' column has been added.
df_train

Unnamed: 0,text,label,text_no_punct,text_no_stop,text_no_stop_trun,sentiment
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,i rented i am curiousyellow from my video stor...,rented curiousyellow video store controversy s...,rented curiousyellow video store controversy s...,"[{'label': 'NEGATIVE', 'score': 0.979001402854..."
1,"""I Am Curious: Yellow"" is a risible and preten...",0,i am curious yellow is a risible and pretentio...,curious yellow risible pretentious steaming pi...,curious yellow risible pretentious steaming pi...,"[{'label': 'NEGATIVE', 'score': 0.999213457107..."
2,If only to avoid making this type of film in t...,0,if only to avoid making this type of film in t...,avoid making type film future film interesting...,avoid making type film future film interesting...,"[{'label': 'NEGATIVE', 'score': 0.998963952064..."
3,This film was probably inspired by Godard's Ma...,0,this film was probably inspired by godards mas...,film probably inspired godards masculin fémini...,film probably inspired godards masculin fémini...,"[{'label': 'POSITIVE', 'score': 0.889508426189..."
4,"Oh, brother...after hearing about this ridicul...",0,oh brotherafter hearing about this ridiculous ...,oh brotherafter hearing ridiculous film umptee...,oh brotherafter hearing ridiculous film umptee...,"[{'label': 'NEGATIVE', 'score': 0.995330333709..."
...,...,...,...,...,...,...
24995,A hit at the time but now better categorised a...,1,a hit at the time but now better categorised a...,hit time better categorised australian cult fi...,hit time better categorised australian cult fi...,"[{'label': 'NEGATIVE', 'score': 0.992492377758..."
24996,I love this movie like no other. Another time ...,1,i love this movie like no other another time i...,love movie like another time try explain virtu...,love movie like another time try explain virtu...,"[{'label': 'NEGATIVE', 'score': 0.801946938037..."
24997,This film and it's sequel Barry Mckenzie holds...,1,this film and its sequel barry mckenzie holds ...,film sequel barry mckenzie holds two greatest ...,film sequel barry mckenzie holds two greatest ...,"[{'label': 'POSITIVE', 'score': 0.998673677444..."
24998,'The Adventures Of Barry McKenzie' started lif...,1,the adventures of barry mckenzie started life ...,adventures barry mckenzie started life satiric...,adventures barry mckenzie started life satiric...,"[{'label': 'NEGATIVE', 'score': 0.937912523746..."


In [11]:
# Persist the scored frame so the classifier pass does not have to be repeated.
with open('../data/pretrained_data.pkl', 'wb') as output_file:
    pickle.dump(df_train, output_file)

In [14]:
# Flatten the per-row prediction lists into separate 'label' / 'score' columns.
# explode() gives every prediction dict its own row while keeping the row's
# original index label.
sent_exploded = df_train.explode('sentiment')
sent_normalized = json_normalize(sent_exploded['sentiment'])
# json_normalize may hand back a fresh 0..n-1 index instead of the input's
# index. Re-attach the exploded index so that index-aligned assignments back
# onto df_train match rows by label rather than by accidental position.
sent_normalized.index = sent_exploded.index
sent_normalized

Unnamed: 0,label,score
0,NEGATIVE,0.979001
1,NEGATIVE,0.999213
2,NEGATIVE,0.998964
3,POSITIVE,0.889508
4,NEGATIVE,0.995330
...,...,...
24995,NEGATIVE,0.992492
24996,NEGATIVE,0.801947
24997,POSITIVE,0.998674
24998,NEGATIVE,0.937913


In [15]:
# Copy the flattened prediction fields onto the main frame (aligned on index).
# Note: 'sentiment_accuracy' stores the classifier's 'score' for the predicted
# label, not an accuracy measured against the ground-truth 'label' column.
df_train['sentiment_label'] = sent_normalized['label']
df_train['sentiment_accuracy'] = sent_normalized['score']

In [16]:
# Inspect the frame with the new 'sentiment_label' and 'sentiment_accuracy' columns.
df_train

Unnamed: 0,text,label,text_no_punct,text_no_stop,text_no_stop_trun,sentiment,sentiment_label,sentiment_accuracy
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,i rented i am curiousyellow from my video stor...,rented curiousyellow video store controversy s...,rented curiousyellow video store controversy s...,"[{'label': 'NEGATIVE', 'score': 0.979001402854...",NEGATIVE,0.979001
1,"""I Am Curious: Yellow"" is a risible and preten...",0,i am curious yellow is a risible and pretentio...,curious yellow risible pretentious steaming pi...,curious yellow risible pretentious steaming pi...,"[{'label': 'NEGATIVE', 'score': 0.999213457107...",NEGATIVE,0.999213
2,If only to avoid making this type of film in t...,0,if only to avoid making this type of film in t...,avoid making type film future film interesting...,avoid making type film future film interesting...,"[{'label': 'NEGATIVE', 'score': 0.998963952064...",NEGATIVE,0.998964
3,This film was probably inspired by Godard's Ma...,0,this film was probably inspired by godards mas...,film probably inspired godards masculin fémini...,film probably inspired godards masculin fémini...,"[{'label': 'POSITIVE', 'score': 0.889508426189...",POSITIVE,0.889508
4,"Oh, brother...after hearing about this ridicul...",0,oh brotherafter hearing about this ridiculous ...,oh brotherafter hearing ridiculous film umptee...,oh brotherafter hearing ridiculous film umptee...,"[{'label': 'NEGATIVE', 'score': 0.995330333709...",NEGATIVE,0.995330
...,...,...,...,...,...,...,...,...
24995,A hit at the time but now better categorised a...,1,a hit at the time but now better categorised a...,hit time better categorised australian cult fi...,hit time better categorised australian cult fi...,"[{'label': 'NEGATIVE', 'score': 0.992492377758...",NEGATIVE,0.992492
24996,I love this movie like no other. Another time ...,1,i love this movie like no other another time i...,love movie like another time try explain virtu...,love movie like another time try explain virtu...,"[{'label': 'NEGATIVE', 'score': 0.801946938037...",NEGATIVE,0.801947
24997,This film and it's sequel Barry Mckenzie holds...,1,this film and its sequel barry mckenzie holds ...,film sequel barry mckenzie holds two greatest ...,film sequel barry mckenzie holds two greatest ...,"[{'label': 'POSITIVE', 'score': 0.998673677444...",POSITIVE,0.998674
24998,'The Adventures Of Barry McKenzie' started lif...,1,the adventures of barry mckenzie started life ...,adventures barry mckenzie started life satiric...,adventures barry mckenzie started life satiric...,"[{'label': 'NEGATIVE', 'score': 0.937912523746...",NEGATIVE,0.937913


NameError: name 'df_train' is not defined