# Load database

This notebook prepares a combined dataset from several data sources. At the end, a .csv file is created containing two columns: `tweet`, containing the text, and `ironic`, containing the sentiment label.

In [9]:
# General import and load data
import numpy as np
import nltk
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# NLTK resources fetched up front (tokenizer models and stop-word lists)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Load the source tables: df_noironicos is the Mexican corpus,
# df_train / df_development are the TASS datasets
df_noironicos = pd.read_csv('DATA/noironicos_bodies.csv')
df_train = pd.read_csv('DATA/train_data.csv')
df_development = pd.read_csv('DATA/development.csv')

# Encode the TASS polarity labels as integers.
# The boolean `ironic` column of df_noironicos is left untouched here; it is
# encoded later, once all sources have been merged into a single frame.
# NOTE(review): 'N' and 'P' are coded 1 here, whereas the later encoding cell
# codes 'N'/'P' as 2 and discards those rows — confirm which treatment is
# intended for the TASS tweets.
polarity_codes = {'NEU': 0, 'NONE': 0, 'N': 1, 'P': 1}
train_label_col = 'sentiment/polarity/value'
dev_label_col = 'sentiment'
for label, code in polarity_codes.items():
    df_train.loc[df_train[train_label_col] == label, train_label_col] = code
for label, code in polarity_codes.items():
    df_development.loc[df_development[dev_label_col] == label, dev_label_col] = code

# Discard the columns that are not used downstream
df_noironicos = df_noironicos.drop(
    columns=['id_tweet', 'depends_image', 'depends_link', 'depends_retweet']
)
tass_unused_columns = ['tweetid', 'user', 'date', 'lang']
df_train = df_train.drop(columns=tass_unused_columns)
df_development = df_development.drop(columns=tass_unused_columns)

# Remove rows of the Mexican corpus that have no tweet text
df_noironicos = df_clean = df_noironicos.dropna(subset=['text'])

# Final dataset
df_noironicos

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,ironic,text
0,True,Algunas personas sufren en las discos mientras...
3,True,@jacevedoaraya es para sostener el marcador......
4,True,Alguna de estas imágenes te sacara una sonrisa...
5,True,@_Eurovision2014 en 2013 falta esdm jajajajaja...
6,True,Hooo que buen padre...#Sarcasmo #GH2015
7,True,"@JhoynerV ja ja ja ja ja así o más claro, cas..."
8,False,@patronbermudez con todo respeto lo principios...
11,True,Gran rapidez todo en la UPO y no iban a ser me...
12,True,¿Que humilde es Simeone no? #ironía #llorón #f...
13,True,¿Alguien se viene a la playa conmigo? #resfria...


In [10]:
# Import more data from pickle. Additional datasets for balancing.
import pickle


def _load_pickled_tweets(path):
    """Unpickle the tweet records stored at `path` and strip their two leading fields.

    Parameters
    ----------
    path : str
        Location of the pickle file to read.

    Returns
    -------
    The unpickled collection, with the first two items of every record
    removed in place.
    """
    # SECURITY: pickle.load can execute arbitrary code while loading;
    # only use it on files that come from a trusted source.
    # The `with` block guarantees the file handle is closed after loading.
    with open(path, "rb") as pickle_file:
        tweets = pickle.load(pickle_file)
    for tweet in tweets:
        # Drop the two leading fields of the record; the merge step reads
        # index 0 and index 1 of what remains.
        del tweet[:2]
    return tweets


# 1st dataset
train17 = _load_pickled_tweets("DATA/CLASSIFIERS/train17.pickle")

# 2nd dataset
dev17 = _load_pickled_tweets("DATA/CLASSIFIERS/dev17.pickle")

# 3rd dataset
train15 = _load_pickled_tweets("DATA/CLASSIFIERS/train15.pickle")




In [11]:
# 4th dataset
# SECURITY: pickle.load can execute arbitrary code while loading; only use it
# on files that come from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open("DATA/CLASSIFIERS/train15_2.pickle", "rb") as pickle_file:
    train15_2 = pickle.load(pickle_file)
for tweet in train15_2:
    # Drop the two leading fields of the record; the merge step reads
    # index 0 and index 1 of what remains.
    del tweet[:2]


In [12]:
# Total number of records across the four pickled datasets
sum(len(dataset) for dataset in (train15_2, train15, dev17, train17))

13453

In [13]:
# Concatenate the four pickled datasets, in order, into one collection
tweet_sources = [train17, dev17, train15, train15_2]
tot_lists = tweet_sources[0]
for more_tweets in tweet_sources[1:]:
    # `+` builds a new object each time, so the individual sources stay untouched
    tot_lists = tot_lists + more_tweets
len(tot_lists)

13453

In [14]:
# Merge all tweets and create a common dataframe.
# df_train and df_development are the TASS datasets; df_noironicos is the
# Mexican dataset containing ironic tweets; tot_lists holds the pickled records.

# For the pickled records, index 0 feeds the `tweet` column and index 1 feeds
# the `ironic` column.
field = [tweet[0] for tweet in tot_lists]
sentiment = [tweet[1] for tweet in tot_lists]

# Create a list for every column in each source dataset
field_1 = df_noironicos['text'].tolist()
sentiment_1 = df_noironicos['ironic'].tolist()
field_2 = df_development['content'].tolist()
sentiment_2 = df_development['sentiment'].tolist()
field_3 = df_train['content'].tolist()
sentiment_3 = df_train['sentiment/polarity/value'].tolist()

# Merge the lists (texts and labels must be concatenated in the same source
# order so that they stay aligned) and define the dataset
field_complete = field_1 + field_2 + field_3 + field
sentiment_complete = sentiment_1 + sentiment_2 + sentiment_3 + sentiment
df = pd.DataFrame(
    {'tweet': field_complete,
     'ironic': sentiment_complete
    })

In [27]:
# Codify all non-numeric labels.
# Each (raw label, code) pair is applied in order: 0 and 1 are the labels that
# are kept, 2 marks rows that are removed right below.
label_codes = [
    ('NONE', 0),
    ('NEU', 0),
    ('N', 2),
    ('P', 2),
    (True, 1),
    (False, 0),
]
for raw_label, code in label_codes:
    df.loc[df['ironic'] == raw_label, 'ironic'] = code

# Keep only the rows that are neither coded 2 nor labelled 'N+' / 'P+'
rows_to_keep = (df['ironic'] != 2) & (df['ironic'] != 'N+') & (df['ironic'] != 'P+')
df = df[rows_to_keep]


In [28]:
# Write the merged dataset to disk, without the DataFrame index column
output_csv = 'final_dataset.csv'
df.to_csv(output_csv, index=False)