# Initialising pyspark

We will use pyspark to read the original txt files. We do not need a separate validation file: we will merge the original train, test and val files into one single dataset and then divide it into train and test sets in a 75/25 proportion.

In [1]:
import os

import pyspark

In [2]:
# Start (or attach to) the Spark session first and take the SparkContext from it,
# so both handles refer to the same running Spark application.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Creating one single dataframe

In [3]:
from pyspark.sql.types import *

In [4]:
# Names of the splits to load; each one is read from
# emotion/<name>_text.txt and emotion/<name>_labels.txt.
data_names = ["train", "test", "val"]

# Every column is declared as a nullable string. `idx` (the line number) is only
# needed to pair a tweet with its label and is dropped once the pairing is done.
schemaString = "idx tweet label"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)

dfs = []
n_features = 100  # not referenced in this cell
ngrams = 2  # not referenced in this cell

print("Lenght of each set")
for data in data_names:
    # textFile already yields one element per line, so the lines need no further
    # splitting: attach the line number and flip (line, idx) into (idx, line) so
    # that the line number can serve as the join key.
    t_idx = (
        sc.textFile('emotion/'+data+'_text.txt', use_unicode=True)
        .zipWithIndex()
        .map(lambda x: (x[1], x[0]))
    )
    l_idx = (
        sc.textFile('emotion/'+data+'_labels.txt', use_unicode=True)
        .zipWithIndex()
        .map(lambda x: (x[1], x[0]))
    )
    # Inner join on the line number: (idx, (tweet, label)) -> (idx, tweet, label).
    datasets = t_idx.join(l_idx).map(lambda x: (x[0], x[1][0], x[1][1]))
    df = spark.createDataFrame(datasets, schema).drop('idx')
    dfs.append(df)
    print(data+': ', df.count())

# Stack all loaded splits into one DataFrame, however many entries data_names has.
final = dfs[0]
for df in dfs[1:]:
    final = final.union(df)
print("Final data: ", final.count())

Lenght of each set
train:  3257
test:  1421
val:  374
Final data:  5052


In [5]:
# Collect the Spark DataFrame into a pandas DataFrame, then shuffle every row
# (frac=1) using a fixed random seed.
collected_pd = final.toPandas()
final_pd = collected_pd.sample(frac=1, random_state=1492)

In [6]:
# Number of tweets per label, to show how imbalanced the classes are
final_pd['label'].value_counts()

0    2118
3    1326
1    1163
2     445
Name: label, dtype: int64

## Saving the dataset

In [7]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs('Data', exist_ok=True)
# Persist the full shuffled dataset without the pandas index column.
final_pd.to_csv('Data/all_tweets.csv', index=False)

## Generating train and test sets

In [8]:
from sklearn.model_selection import train_test_split
# Features are the tweet texts, targets are their labels.
X, y = final_pd['tweet'], final_pd['label']
# Hold out 25% of the rows for testing; stratify=y asks for the label
# distribution to be preserved in both parts, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1492,
)

In [9]:
# Sanity check: features and targets of the training set must have the same length
(X_train.shape, y_train.shape)

((3789,), (3789,))

In [10]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs('Data', exist_ok=True)
# Write each part of the split to its own CSV file, without the pandas index.
X_train.to_csv('Data/X_train.csv', index=False)
y_train.to_csv('Data/y_train.csv', index=False)
X_test.to_csv('Data/X_test.csv', index=False)
y_test.to_csv('Data/y_test.csv', index=False)