In [1]:
import numpy as np
import pandas as pd
import itertools
import glob
import re
from sklearn.model_selection import train_test_split

### Split the original data into train/dev/test (most rows go to dev/test)

In [24]:
# Load every legacy CSV (files have no header row) and stack them into one frame.
# Sorting the paths makes the row order independent of the filesystem's listing
# order; reading into a list and concatenating once avoids re-copying the
# accumulated frame on every loop iteration.
old_data = sorted(glob.glob("old_data/*.csv"))
frames = [pd.read_csv(file, header=None) for file in old_data]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files match (this is what the incremental version produced as well).
data = pd.concat(frames) if frames else pd.DataFrame()

In [26]:
# Class balance: number of rows per distinct value in column 0 (the label column).
data[0].value_counts()

0    582
1     79
Name: 0, dtype: int64

In [15]:
# Hold out 450 rows, then carve 300 of those off as the test set; the rest of
# the hold-out becomes dev. A fixed random_state makes both shuffles repeatable
# so the splits can be regenerated exactly.
train, test = train_test_split(data, test_size=450, random_state=42)
dev, test = train_test_split(test, test_size=300, random_state=42)
train.shape, dev.shape, test.shape

((211, 2), (150, 2), (300, 2))

In [17]:
# Persist the three splits as CSV files without a header row or index column.
data_path = "../../../data/sentiment/el/"

for filename, split in (("train.csv", train), ("dev.csv", dev), ("test.csv", test)):
    split.to_csv(data_path + filename, header=False, index=False)

### Reviews

In [19]:
# Build a labelled sample of phone reviews.
# - skiprows drops two specific lines of the TSV (presumably malformed rows — TODO confirm).
# - 3-star reviews are discarded; the rest are labelled 1 when star > 3, else 0.
# - The whole pipeline is one chain so the new column is added to a fresh frame
#   (no assignment onto a filtered slice), and the sample is seeded so the same
#   200 rows are drawn on every run.
df = (
    pd.read_csv("mobile_phones_review.tsv", sep="\t", skiprows=[2461, 4729])
    .loc[lambda frame: frame["star"] != 3]
    .assign(sentiment=lambda frame: (frame["star"] > 3).astype(int))
    .sample(200, random_state=42)
)

In [20]:
data_path = "../../../data/sentiment/el/"

# Re-load the saved training split (no header row) and append the sampled
# reviews, relabelling their columns to the positional 0 (label) / 1 (text)
# layout used by the CSV.
existing_train = pd.read_csv(data_path + "train.csv", header=None)
review_rows = df[["sentiment", "review"]].rename(columns={"sentiment": 0, "review": 1})
train = pd.concat([existing_train, review_rows])

In [21]:
# Sanity check: size of the augmented training set and the mean of the 0/1
# label column, i.e. the share of rows labelled 1.
train.shape, train[0].mean()

((411, 2), 0.5304136253041363)

In [22]:
# Overwrite train.csv with the augmented training set.
# NOTE(review): the cell that builds `train` reads this same file before
# appending the reviews, so re-running that cell after this one appends the
# reviews a second time.
train.to_csv(data_path + "train.csv", header=False, index=False)

### Tweets

In [5]:
def preprocess_tweets(rated_path="RatedTweets.xlsx",
                      lexcreation_path="RatedTweets_lexcreation.xls",
                      threshold=3.5):
    """Load the two rated-tweet spreadsheets and derive a binary sentiment label.

    The first row of the ``rated_path`` sheet is read as the six emotion names.
    The first 12 columns of the combined data are relabelled
    ``rater1_<emotion>`` (x6) followed by ``rater2_<emotion>`` (x6), and the two
    raters' scores are averaged per emotion. A tweet is kept only when its
    highest-scoring emotion is one of happiness / anger / disgust / sadness and
    that averaged score is at least ``threshold``. Happiness maps to sentiment 1,
    the other three to sentiment 0.

    Parameters
    ----------
    rated_path : str
        Excel file whose first 12 columns are the ratings and whose last column
        is kept alongside them (expected to be named ``Tweet`` — TODO confirm
        against the sheet).
    lexcreation_path : str
        Second Excel file; its ``"Unnamed: 12"`` column is renamed to ``Tweet``.
    threshold : float
        Minimum averaged score of the dominant emotion for a tweet to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``Tweet``, one averaged score per emotion, ``main_emotion`` and
        ``sentiment``; rows with a missing ``Tweet`` are dropped. Index labels
        are carried over from the two sheets and may therefore repeat.
    """
    rated = pd.read_excel(rated_path)
    # Keep the 12 rating columns plus the sheet's last column.
    rated = rated[rated.columns[:12].tolist() + [rated.columns[-1]]]
    # Row 0 carries the emotion names rather than scores; read it, then drop it.
    emotions = rated.iloc[0, :6].tolist()
    rating_headers = [
        rater + emotion
        for rater in ("rater1_", "rater2_")
        for emotion in emotions
    ]
    rated = rated.drop(0)

    lexcreation = (
        pd.read_excel(lexcreation_path)
        .rename(columns={"Unnamed: 12": "Tweet"})
        .drop(0)
    )

    ratings = pd.concat([rated, lexcreation])
    ratings.columns = rating_headers + ratings.columns[12:].tolist()

    new_df = ratings[["Tweet"]].copy()

    n_emotions = len(emotions)
    for position, emotion in enumerate(emotions):
        # Column `position` holds rater 1's score for this emotion and column
        # `position + n_emotions` holds rater 2's; average them row-wise.
        both_raters = ratings.iloc[:, [position, position + n_emotions]].astype(float)
        new_df[emotion] = both_raters.to_numpy().mean(axis=1)

    scores = new_df[emotions]
    new_df["main_emotion"] = scores.idxmax(axis=1)

    # main_emotion is the arg-max column, so its score equals the row maximum:
    # "dominant emotion is X and X >= threshold" reduces to the two masks below.
    is_polar = new_df["main_emotion"].isin(["happiness", "anger", "disgust", "sadness"])
    is_strong = scores.max(axis=1) >= threshold

    # .copy() so the label below is written to an independent frame rather than
    # to a slice of the unfiltered one (avoids SettingWithCopyWarning).
    new_df = new_df[is_polar & is_strong].copy()
    new_df["sentiment"] = (new_df["main_emotion"] == "happiness").astype(int)

    return new_df.dropna(subset=["Tweet"])

In [None]:
# Run the tweet pipeline with its default inputs; new_df holds the kept tweets
# together with their per-emotion scores, main_emotion and sentiment label.
new_df = preprocess_tweets()

Mix the tweets with the original data

In [132]:
# Load the legacy CSVs (no header row). Paths are sorted so the row order does
# not depend on the filesystem, and the files are read into a list so they are
# concatenated once instead of re-copying a growing frame per file.
old_data = sorted(glob.glob("old_data/*.csv"))
frames = [pd.read_csv(file, header=None) for file in old_data]

# Reshape the tweets to the positional 0 (label) / 1 (text) layout of the CSVs.
# They are bound to a new name rather than overwriting new_df, so this cell can
# be re-run without failing on the already-renamed columns.
tweet_rows = new_df[["sentiment", "Tweet"]].rename(columns={"sentiment": 0, "Tweet": 1})

# Legacy rows first, tweets last; rows without text are dropped.
data = pd.concat(frames + [tweet_rows]).dropna(subset=[1])

In [133]:
# 70 % train; the remaining 30 % is split 1/3 dev and 2/3 test.
# A fixed random_state makes both shuffles repeatable.
train, test = train_test_split(data, test_size=0.3, random_state=42)
dev, test = train_test_split(test, test_size=2/3, random_state=42)
train.shape, dev.shape, test.shape

((847, 2), (121, 2), (242, 2))

In [134]:
# Persist the three splits as CSV files without a header row or index column.
data_path = "../../../data/sentiment/el/"

for filename, split in (("train.csv", train), ("dev.csv", dev), ("test.csv", test)):
    split.to_csv(data_path + filename, header=False, index=False)

### All datasets

Original

In [2]:
# Load every legacy CSV (files have no header row) and stack them into one frame.
# Sorting the paths makes the row order independent of the filesystem's listing
# order; reading into a list and concatenating once avoids re-copying the
# accumulated frame on every loop iteration.
old_data = sorted(glob.glob("old_data/*.csv"))
frames = [pd.read_csv(file, header=None) for file in old_data]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files match (this is what the incremental version produced as well).
data = pd.concat(frames) if frames else pd.DataFrame()

In [3]:
# Hold out 450 rows, then carve 300 of those off as the test set; the rest of
# the hold-out becomes dev. A fixed random_state makes both shuffles repeatable.
train, test = train_test_split(data, test_size=450, random_state=42)
dev, test = train_test_split(test, test_size=300, random_state=42)
train.shape, dev.shape, test.shape

((211, 2), (150, 2), (300, 2))

Tweets

In [6]:
data = preprocess_tweets()
# Reduce to the positional 0 (label) / 1 (text) layout used by the CSV files.
data = data[["sentiment", "Tweet"]].rename(columns={"sentiment": 0, "Tweet": 1})

# Remove @mentions, together with an optional leading "RT " and trailing ":".
# The handle character class includes digits: without them "@user123" would be
# only partly removed and leave "123" behind in the text.
data[1] = data[1].str.replace(r"(RT )?@[a-zA-Z0-9_]+:?", "", regex=True).str.strip()

# Stripping mentions can make retweets identical to their source; keep one copy.
data = data.drop_duplicates()

In [7]:
# Hold out 250 tweets, then split the hold-out 1/3 dev and 2/3 test.
# A fixed random_state makes both shuffles repeatable.
train_tweets, test_tweets = train_test_split(data, test_size=250, random_state=42)
dev_tweets, test_tweets = train_test_split(test_tweets, test_size=2/3, random_state=42)

train_tweets.shape, dev_tweets.shape, test_tweets.shape

((269, 2), (83, 2), (167, 2))

Phone reviews

In [12]:
# Build a labelled sample of phone reviews in the positional 0 (label) / 1 (text) layout.
# - skiprows drops two specific lines of the TSV (presumably malformed rows — TODO confirm).
# - 3-star reviews are discarded; the rest are labelled 1 when star > 3, else 0.
# - One chain, so the new column is added to a fresh frame (no assignment onto a
#   filtered slice), and the sample is seeded so the same 1500 rows are drawn on
#   every run.
df = (
    pd.read_csv("mobile_phones_review.tsv", sep="\t", skiprows=[2461, 4729])
    .loc[lambda frame: frame["star"] != 3]
    .assign(sentiment=lambda frame: (frame["star"] > 3).astype(int))
    .sample(1500, random_state=42)
    .loc[:, ["sentiment", "review"]]
    .rename(columns={"sentiment": 0, "review": 1})
)

In [13]:
# 70 % train; the remaining 30 % is split 1/3 dev and 2/3 test.
# A fixed random_state makes both shuffles repeatable.
train_reviews, test_reviews = train_test_split(df, test_size=0.3, random_state=42)
dev_reviews, test_reviews = train_test_split(test_reviews, test_size=2/3, random_state=42)

train_reviews.shape, dev_reviews.shape, test_reviews.shape

((1050, 2), (150, 2), (300, 2))

Join all

In [15]:
# Stack the per-source splits (original, tweets, reviews) into the final
# train / dev / test frames. The names on the left are also inputs on the
# right, so running this cell a second time appends the tweets and reviews again.
train, dev, test = (
    pd.concat(parts)
    for parts in (
        [train, train_tweets, train_reviews],
        [dev, dev_tweets, dev_reviews],
        [test, test_tweets, test_reviews],
    )
)

train.shape, dev.shape, test.shape

((1530, 2), (383, 2), (767, 2))

In [16]:
# Persist the three combined splits as CSV files without a header row or index column.
data_path = "../../../data/sentiment/el/"

for filename, split in (("train.csv", train), ("dev.csv", dev), ("test.csv", test)):
    split.to_csv(data_path + filename, header=False, index=False)