# Sentiment datasets

In [1]:
# Imports

import numpy as np
import pandas as pd 

In [2]:
# Directory holding the MTSA tweet files; the cells below build every MTSA
# path as FOLDER + <file name>, so the trailing slash is required.
FOLDER = "../Dataset-MTSA/"

In [3]:
class DataReader:
    """Read tweet/label CSV files as pandas DataFrames or NumPy arrays.

    Parameters
    ----------
    task_a : str
        Path of the CSV file used by `get_df_train_data` and, when no
        explicit ``file`` is given, by `get_df_data` and
        `get_np_data_and_labels`.
    """

    # Columns selected when the caller does not pass `column_names_list`.
    # Stored as a tuple so the shared default can never be mutated.
    DEFAULT_COLUMNS = ("tweet", "label_a")

    def __init__(self, task_a="../Dataset-OLID/OLIDv1.0/data_subtask_a.csv"):
        self.task_a = task_a

    def get_df_train_data(self):
        """Read ``self.task_a`` and drop its bookkeeping columns.

        Returns
        -------
        pandas.DataFrame
            The file contents without the ``"Unnamed: 0"``, ``"id"`` and
            ``"subtask_a"`` columns.

        Raises
        ------
        KeyError
            If any of the three dropped columns is missing from the file.
        """
        train_data = pd.read_csv(self.task_a)
        return train_data.drop(["Unnamed: 0", "id", "subtask_a"], axis=1)

    def get_df_data(self, file=None, column_names_list=None):
        """Read a CSV file and keep only the requested columns.

        Parameters
        ----------
        file : str, optional
            Path of the CSV file. Defaults to the ``task_a`` path given to
            the constructor, so a reader configured for another file does
            not silently fall back to the OLID data.
        column_names_list : list of str, optional
            Columns to select, in order. Defaults to ``["tweet", "label_a"]``.

        Returns
        -------
        pandas.DataFrame
            The selected columns of the file.
        """
        if file is None:
            file = self.task_a
        if column_names_list is None:
            # Fresh list per call: a tuple key would be treated by pandas as
            # a single (multi-level) column label rather than a selection.
            column_names_list = list(self.DEFAULT_COLUMNS)
        data = pd.read_csv(file)
        return data[column_names_list]

    def get_np_data_and_labels(self, file=None, column_names_list=None):
        """Read a CSV file and return its first two selected columns as arrays.

        Parameters
        ----------
        file : str, optional
            Path of the CSV file; defaults to ``self.task_a``.
        column_names_list : list of str, optional
            Columns to select; the first becomes ``data`` and the second
            ``labels``. Defaults to ``["tweet", "label_a"]``.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            ``data`` and ``labels``, both 1-D. They are slices of one 2-D
            array, so they share its dtype (``object`` when the selected
            columns have different types, e.g. text and integer labels).
        """
        tweets = self.get_df_data(file, column_names_list)
        # Convert once; each `.values` access may build a new 2-D array.
        values = tweets.values
        return values[:, 0], values[:, 1]

    def shuffle_np(self, data, labels):
        """Shuffle two equally long arrays with the same random permutation.

        The inputs are left untouched: indexing with a permutation array
        creates copies. The permutation comes from NumPy's global random
        state, so call ``np.random.seed`` beforehand for reproducibility.

        Raises
        ------
        AssertionError
            If ``data`` and ``labels`` differ in length.
        """
        assert len(data) == len(labels), "data and labels must have the same length"
        p = np.random.permutation(len(data))
        return data[p], labels[p]

## Adding the `label_a` column to every file

In [13]:
# Alternative input list (train/test splits), kept for reference.
# NOTE(review): those files appear to use POS/NEG labels, so the category
# list below would have to change before converting them -- confirm.
#MTSA_files = ["tweets_test.tsv", "tweets_train.tsv", 
#              "tweets_train_20.tsv", "tweets_train_80.tsv"]

MTSA_files = ["tweets_consensus.tsv", "tweets_50-50.tsv"]

# Category order fixes the integer codes written to `label_a`:
# "NOT" -> 0, "NEG" -> 1.
SENTIMENT_CATEGORIES = ["NOT", "NEG"]

for file in MTSA_files:
    temp_data = pd.read_csv(FOLDER + file, encoding='utf-8', sep="\t")

    # Values outside the category list (including missing ones) would become
    # NaN in the categorical and be written as label -1; fail loudly instead.
    unknown = ~temp_data.sentiment.isin(SENTIMENT_CATEGORIES)
    if unknown.any():
        bad_values = sorted(set(map(str, temp_data.sentiment[unknown])))
        raise ValueError(
            f"{file}: {int(unknown.sum())} row(s) have a sentiment outside "
            f"{SENTIMENT_CATEGORIES}: {bad_values}"
        )

    temp_data.sentiment = pd.Categorical(temp_data.sentiment, ordered=True,
                                         categories=SENTIMENT_CATEGORIES)
    temp_data['label_a'] = temp_data.sentiment.cat.codes
    # file[:-4] strips the ".tsv" extension; the result is saved next to the
    # input as a comma-separated file (the row index is written as well).
    temp_data.to_csv(FOLDER + file[:-4] + ".csv")

In [11]:
# Sanity check: reload the converted consensus file and show its first rows
# to confirm that `label_a` was added next to `sentiment`.
test = pd.read_csv(FOLDER + "tweets_consensus.csv")
test.head(20)

Unnamed: 0.1,Unnamed: 0,id,sentiment,tweet,label_a
0,0,0,NOT,"Finished Season one, now starting #StrangerThings",0
1,1,1,NOT,If NBA playoffs started today Cavs wouldn't ma...,0
2,2,2,NOT,My boyfriend knew I wasn’t gonna have lunch be...,0
3,3,3,NOT,@cheetah_spotty Ahh! I've always referred to t...,0
4,4,4,NOT,@mfluder_42 @StarMinion We’re thinking dinner ...,0
5,5,5,NOT,"Any man who must say, I am the king, is no tru...",0
6,6,6,NEG,🤦🏽‍♀️🤦🏽‍♀️ the sheriff should have told someon...,1
7,7,7,NOT,#AHSCult and then some #StrangerThings,0
8,8,8,NOT,#thor was so great!,0
9,9,9,NOT,Anyone else peep the #ReaganBush84 sign in #St...,0


In [50]:
# Load the training CSV through DataReader; with no column list given,
# get_df_data keeps only the "tweet" and "label_a" columns.
# NOTE(review): tweets_train.csv is not produced by the conversion cell as
# currently configured (it lists only the consensus and 50-50 files) --
# presumably written by an earlier run; confirm before Restart & Run All.
dr = DataReader()
tweets = dr.get_df_data(FOLDER + "tweets_train.csv")
tweets.head(15)

Unnamed: 0,tweet,label_a
0,@jaketapper It's funny because after all these...,0
1,I deserve the masdo album for making it throug...,0
2,@DaltStew @JKLsugi For pc or Xbox??? 😯,0
3,Our family is having thanksgiving dinner today...,0
4,@fuckimlatenow I’m social at all i’d probably ...,0
5,@Swakbro My team is in both xbox and ps4 bro.,0
6,@izzy_blue143 Rs. Sis we gotta go see Thor: Ra...,0
7,Y’all thought Niall wrote that whole album abo...,1
8,@ArtistofCrime -txt- Lunch and I will only pos...,0
9,Tomorrow’s my last beaver football game of my ...,0


In [51]:
# Same file as NumPy arrays: x holds the first selected column (tweets),
# y the second (label_a). Print shapes and contents as a quick check.
x,y = dr.get_np_data_and_labels(FOLDER + "tweets_train.csv")
print(x.shape, y.shape)
print(x)
print(y)

(6650,) (6650,)
["@jaketapper It's funny because after all these months it is still one big nothing burger"
 'I deserve the masdo album for making it through this week without stabbing someone with my heels 👠'
 '@DaltStew @JKLsugi For pc or Xbox??? 😯' ...
 'Abby and I are taking a nap for breakfast'
 "Asked the fam to bring me home food on the way back home and they said there's food at the house BUT THEY WENT OUT TO EAT MAKES SENSE"
 'i need to catch up with andante soon aaaAaaAAAAaaa but shit man im so in the mood for movie marathoning these past few days']
[0 0 0 ... 0 1 1]


## Exploration

In [29]:
# Load the full set of extracted tweets (tab-separated) and preview it.
data = pd.read_csv(FOLDER + "extracted_tweets.tsv", sep="\t")
data.head(10)

Unnamed: 0,id,sentiment,tweet
0,0,POS,"Finished Season one, now starting #StrangerThings"
1,1,NEG,If NBA playoffs started today Cavs wouldn't ma...
2,2,POS,My boyfriend knew I wasn’t gonna have lunch be...
3,3,POS,@cheetah_spotty Ahh! I've always referred to t...
4,4,POS,@mfluder_42 @StarMinion We’re thinking dinner ...
5,5,NEG,"Any man who must say, I am the king, is no tru..."
6,6,NEG,🤦🏽‍♀️🤦🏽‍♀️ the sheriff should have told someon...
7,7,POS,#AHSCult and then some #StrangerThings
8,8,POS,#thor was so great!
9,9,POS,Anyone else peep the #ReaganBush84 sign in #St...


In [30]:
# (rows, columns) of the extracted tweets frame.
data.shape

(7113, 3)

In [31]:
# Load the test split (tab-separated) and preview it.
test_data = pd.read_csv(FOLDER + "tweets_test.tsv", sep="\t")
test_data.head(10)

Unnamed: 0,id,sentiment,tweet
0,5688,POS,@Janet_Reid I need help getting my book to ma...
1,2270,POS,I had a snicker doodle for breakfast. #adulting
2,6452,POS,I slept good 2day I barely sleep any other day...
3,5692,NEG,I can’t believe hit folk singer Charles Manson...
4,6924,POS,@EA $60 triple A title with microtransactions ...
5,125,NEG,"Most Django projects really only need one app,..."
6,2489,POS,Bryce Mulder's first shot as a college basketb...
7,5666,POS,• he /always/ has some type of snack in classes
8,135,POS,@AscendFCU when are y’all gonna goin the andro...
9,5968,POS,I did over 3k more steps than what I was suppo...


In [32]:
# (rows, columns) of the test split.
test_data.shape

(463, 3)

In [33]:
# Load the training split (tab-separated) and preview it.
train_data = pd.read_csv(FOLDER + "tweets_train.tsv", sep="\t")
train_data.head(10)

Unnamed: 0,id,sentiment,tweet
0,1213,POS,@jaketapper It's funny because after all these...
1,5415,POS,I deserve the masdo album for making it throug...
2,1540,POS,@DaltStew @JKLsugi For pc or Xbox??? 😯
3,777,POS,Our family is having thanksgiving dinner today...
4,3792,POS,@fuckimlatenow I’m social at all i’d probably ...
5,6299,POS,@Swakbro My team is in both xbox and ps4 bro.
6,5239,POS,@izzy_blue143 Rs. Sis we gotta go see Thor: Ra...
7,3931,NEG,Y’all thought Niall wrote that whole album abo...
8,1749,POS,@ArtistofCrime -txt- Lunch and I will only pos...
9,3027,POS,Tomorrow’s my last beaver football game of my ...


In [34]:
# (rows, columns) of the training split.
train_data.shape

(6650, 3)

In [38]:
# Categorical with inferred categories: the displayed order is [NEG, POS],
# so NEG would receive code 0 unless the order is given explicitly.
pd.Categorical(test_data.sentiment)

[POS, POS, POS, NEG, POS, ..., POS, POS, POS, POS, NEG]
Length: 463
Categories (2, object): [NEG, POS]

In [44]:
# Fix the category order explicitly so that POS -> 0 and NEG -> 1, then
# store the integer codes as `label_a`.
# Note: this overwrites test_data.sentiment and adds a column in place.
test_data.sentiment = pd.Categorical(test_data.sentiment, ordered = True,
                                    categories=["POS", "NEG"])
test_data['label_a'] = test_data.sentiment.cat.codes
test_data.head(10)

Unnamed: 0,id,sentiment,tweet,label_a
0,5688,POS,@Janet_Reid I need help getting my book to ma...,0
1,2270,POS,I had a snicker doodle for breakfast. #adulting,0
2,6452,POS,I slept good 2day I barely sleep any other day...,0
3,5692,NEG,I can’t believe hit folk singer Charles Manson...,1
4,6924,POS,@EA $60 triple A title with microtransactions ...,0
5,125,NEG,"Most Django projects really only need one app,...",1
6,2489,POS,Bryce Mulder's first shot as a college basketb...,0
7,5666,POS,• he /always/ has some type of snack in classes,0
8,135,POS,@AscendFCU when are y’all gonna goin the andro...,0
9,5968,POS,I did over 3k more steps than what I was suppo...,0


In [22]:
# Default reader and default arguments: loads the OLID subtask A CSV and
# keeps only the "tweet" and "label_a" columns.
dr = DataReader()
dr.get_df_data()

Unnamed: 0,tweet,label_a
0,@USER She should ask a few native Americans wh...,1
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,1
2,Amazon is investigating Chinese employees who ...,0
3,"@USER Someone should'veTaken"" this piece of sh...",1
4,@USER @USER Obama wanted liberals &amp; illega...,0
...,...,...
13235,@USER Sometimes I get strong vibes from people...,1
13236,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,0
13237,@USER And why report this garbage. We don't g...,1
13238,@USER Pussy,1


In [26]:
# Same default (OLID) file as two NumPy arrays: tweets in x, labels in y.
x,y = dr.get_np_data_and_labels()

In [27]:
# Inspect the tweet array: container type, shape, then the values.
print(type(x))
print(x.shape)
x

<class 'numpy.ndarray'>
(13240,)


array(['@USER She should ask a few native Americans what their take on this is.',
       '@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL',
       'Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT',
       ..., "@USER And why report this garbage.  We don't give a crap.",
       '@USER Pussy',
       '#Spanishrevenge vs. #justice #HumanRights and #FreedomOfExpression #Spain is a  #fakedemocracy @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER #cddr #shameonSpain #WakeupEurope @USER URL'],
      dtype=object)

In [28]:
# Inspect the label array. The output shows dtype=object rather than an
# integer dtype, because y is sliced from the same mixed-type 2-D array as x.
print(type(y))
print(y.shape)
y

<class 'numpy.ndarray'>
(13240,)


array([1, 1, 0, ..., 1, 1, 0], dtype=object)