In [1]:
# Third-party libraries used by the cells below.
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
import sys
# The project helpers live in sibling directories; each directory is put on
# sys.path immediately before the import that needs it, so the order matters.
sys.path.insert(1, '../predict')
from predict import predict
sys.path.insert(1, '../train')
from train_models import train_models
# Model identifiers passed to train_models() and predict() by the cells below.
models=['tree']


## Baselines


In [2]:
# Baseline: train on the unmodified dataset, which contains both fake and true samples.
baseline_csv = '../data/train.csv'
train_models(baseline_csv, models)
predict(models)

number of rows = (26000, 2)
***  Training tree ***
*** EVAL tree ***
acc fake=  0.4435
Performance worse than baseline by  0.0  percentual points


## ------------ Vader ------------

VADER's `compound` score is a value in the interval [-1, 1].


In [2]:
# Reload the raw training data and score every text with VADER.
models = ['tree']
df = pd.read_csv('../data/train.csv')[['Text', 'Score']]
analyzer = SentimentIntensityAnalyzer()

# polarity_scores() yields one dict per text; json_normalize spreads the dict
# keys into columns (neg, neu, pos, compound in the output below).
vader_scores = pd.json_normalize(df['Text'].map(analyzer.polarity_scores))

# Attach the score columns next to the original Text/Score columns.
df = pd.concat([df, vader_scores], axis=1)
df.head(5)

Unnamed: 0,Text,Score,neg,neu,pos,compound
0,indian reside unite state india continue like ...,0,0.233,0.567,0.2,-0.7184
1,way back 1996 one airliner pilot use work give...,1,0.207,0.531,0.262,0.6954
2,schieder deliver semi believable part presiden...,0,0.083,0.786,0.131,0.4404
3,many movies around give feel like stardust thr...,0,0.025,0.578,0.397,0.9816
4,wakayama tomisaburo portrayal fugitive ex kais...,0,0.12,0.738,0.142,0.8074


# Change the cell below as you wish!!!
# You could try:
    1. remove rows with fake labels
    2. add more true rows
    3. change wrong labels
    4. etc...
    

In [3]:
# Example relabelling rule: derive the label from the sign of VADER's compound score.
df_teste = df.copy()

# Shift compound from [-1, 1] to [0, 2] so that integer truncation separates
# negative scores (-> 0) from non-negative ones (-> 1).
df_teste['compound'] = df_teste['compound'] + 1

# The interval is closed at the top: a compound of exactly +1 shifts to 2.0 and
# would truncate to a third label, 2. Clip so the labels stay binary (0 or 1).
df_teste['Score'] = df_teste['compound'].astype('int').clip(upper=1)

In [4]:
# Run this cell to train and test the changes made to the labels above.
# Be careful with the save path: the CSV written here is what gets trained on.
vader_csv = '../data/vader.csv'
df_teste.loc[:, ['Text', 'Score']].to_csv(vader_csv, index=False)
train_models(vader_csv, models)
predict(models)


number of rows = (26000, 2)
***  Training tree ***
*** EVAL tree ***
acc fake=  0.3822
Nice this it better than baseline by  0.06130000000000002  percentual points :D


## ------------ TextBlob ------------
TextBlob's polarity output is a value in the interval [-1, 1].

In [5]:
import pandas as pd
from textblob import TextBlob

# Reload the raw training data and score every text with TextBlob.
models = ['tree']
df = pd.read_csv('../data/train.csv')[['Text', 'Score']]
# `sentiment` is a (polarity, subjectivity) pair; only the polarity is kept.
df['sentiment'] = df['Text'].map(lambda review: TextBlob(review).sentiment.polarity)
df.head(3)

Unnamed: 0,Text,Score,sentiment
0,indian reside unite state india continue like ...,0,0.1125
1,way back 1996 one airliner pilot use work give...,1,0.360417
2,schieder deliver semi believable part presiden...,0,0.214286


In [6]:
# Example relabelling rule: derive the label from the sign of TextBlob's polarity.
df_teste = df.copy()

# TextBlob polarity lies in [-1, 1]; shift it to [0, 2] so that integer
# truncation separates negative polarity (-> 0) from non-negative (-> 1).
df_teste['sentiment'] = df_teste['sentiment'] + 1

# A polarity of exactly +1 shifts to 2.0 and would truncate to a third label, 2.
# Clip so the labels stay binary (0 or 1).
df_teste['Score'] = df_teste['sentiment'].astype('int').clip(upper=1)

In [7]:
# Save the TextBlob-relabelled data, then retrain and evaluate on it.
blob_csv = '../data/blob.csv'
df_teste.loc[:, ['Text', 'Score']].to_csv(blob_csv, index=False)
train_models(blob_csv, models)
predict(models)

number of rows = (26000, 2)
***  Training tree ***
*** EVAL tree ***
acc fake=  0.3582
Nice this it better than baseline by  0.08529999999999999  percentual points :D


## ------------ DISTILBERT ------------
output  
1. label -> {POSITIVE,NEGATIVE}  
2. score -> [0 , 1]

In [9]:
from happytransformer import HappyTextClassification

# Two-label DistilBERT classifier loaded from the SST-2 fine-tuned checkpoint.
happy_tc = HappyTextClassification(
    model_type="DISTILBERT",
    model_name="distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=2,
)

  from .autonotebook import tqdm as notebook_tqdm
12/13/2022 08:53:35 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [10]:
# Reload the raw training data and classify every text with the DistilBERT model.
models = ['tree']
df = pd.read_csv('../data/train.csv')[['Text', 'Score']]

# Only the first 500 characters of each text are passed to the classifier.
df['pred'] = df['Text'].map(lambda review: happy_tc.classify_text(review[:500]))
# Label the model predicted, encoded as 1 for POSITIVE and 0 otherwise.
df['pred_label'] = df['pred'].map(lambda result: int(result.label == 'POSITIVE'))
# Score the model reports for that label (its confidence).
df['pred_score'] = df['pred'].map(lambda result: result.score)
df_teste = df.copy()



In [11]:
# Relabel every row with the model's predicted label; change this rule however you want.
df['Score'] = df['pred_label'].copy()

In [12]:
# Save the BERT-relabelled data, then retrain and evaluate on it.
bert_csv = '../data/bert.csv'
df.loc[:, ['Text', 'Score']].to_csv(bert_csv, index=False)
train_models(bert_csv, models)
predict(models)


number of rows = (26000, 2)
***  Training tree ***
*** EVAL tree ***
acc fake=  0.3238
Nice this it better than baseline by  0.11970000000000003  percentual points :D


## RegexpTokenizer

In [13]:
from nltk.tokenize import RegexpTokenizer

# Tokeniser that extracts runs of word characters.
tokeniser = RegexpTokenizer(r"\w+")

# Reload the raw training data and split it by label.
df = pd.read_csv('../data/train.csv')[['Text', 'Score']]
df1 = df[df['Score'] == 1.0]
df0 = df[df['Score'] == 0]

# Number of most-frequent words kept per label.
N = 200


def _most_common_words(texts, limit):
    """Return the `limit` most frequent whitespace-separated words in `texts`."""
    words = pd.Series(' '.join(texts).split())
    return words.value_counts().index[:limit].tolist()


# Despite the "top100" names, each list holds the N most frequent words of one label.
top100_1 = _most_common_words(df1['Text'], N)
top100_0 = _most_common_words(df0['Text'], N)



In [14]:
# "confidence" of a row = number of distinct tokens in its text that also appear
# in the top-N word list of the row's own label, divided by N.
# The lookup sets are built once here instead of once per row inside the lambda.
top_words_1 = set(top100_1)
top_words_0 = set(top100_0)

df1['confidence'] = df1['Text'].map(
    lambda text: len(set(tokeniser.tokenize(text)) & top_words_1) / N)

df0['confidence'] = df0['Text'].map(
    lambda text: len(set(tokeniser.tokenize(text)) & top_words_0) / N)

# Stack the two label subsets back into one frame (label-1 rows first).
df = pd.concat([df1, df0])

df.head()

Unnamed: 0,Text,Score,confidence
1,way back 1996 one airliner pilot use work give...,1,0.165
6,try use word describe saw original good way ba...,1,0.22
7,one timely engross documentaries ever watch st...,1,0.13
8,quite possibly retard 80 slasher ever realize ...,1,0.255
9,catch film outfest screen los angeles july 200...,1,0.1


In [15]:
# Keep only the rows whose confidence is above the threshold.
MIN_CONFIDENCE = 0.05
df_teste = df.loc[df['confidence'] > MIN_CONFIDENCE]

# Save the filtered data, then retrain and evaluate on it.
filtered_csv = '../data/try_toN.csv'
df_teste[['Text', 'Score']].to_csv(filtered_csv, index=False)

models = ['tree']
train_models(filtered_csv, models)
predict(models)

number of rows = (25021, 2)
***  Training tree ***
*** EVAL tree ***
acc fake=  0.4482
Performance worse than baseline by  -0.004699999999999982  percentual points
