# Data Increasing

El siguiente notebook presenta el proceso de incremento y limpieza de nuevos datos de texto. Igualmente, al final se realizará la unión con los 65 mil registros de texto existentes y ya procesados.

## Carga de nuevos datos

Para almacenar los datos que se van a ir cargando y limpiando, vamos a emplear un diccionario con los nombres de los archivos asociados a su dataset limpio.

In [1]:
import pandas as pd

# Folder holding the raw text datasets; file names are appended to it directly,
# so the trailing slash is required.
DATA_DIR = "../data/text/raw/"
# Maps each source file name to its cleaned DataFrame; the values are stacked
# into the final dataset at the end of the notebook.
cleaned_data = {}

### Anon Disorders

OJO: Las siguientes celdas demoran en ejecutarse porque manipulan un archivo de 4 GB.

In [4]:
# Read the raw disorder-labelled tweets and preview the first rows.
anon_disorder_path = f"{DATA_DIR}anon_disorder_tweets.csv"
df_anon_disorder = pd.read_csv(anon_disorder_path)
df_anon_disorder.head()

Unnamed: 0.1,Unnamed: 0,user_id,text,disorder
0,0,3f21058fc8,@amirulmokhtar Exactly! We were busy buat untu...,anxiety
1,1,3f21058fc8,Ternangis baca text &amp; dengar call reply di...,anxiety
2,2,3f21058fc8,I learn smthg very valuable today.\n\nWhen I w...,anxiety
3,3,3f21058fc8,"@MintKr Ohhh okay. So, to create more opportun...",anxiety
4,4,3f21058fc8,@MintKr Perspective expanding experience tu ap...,anxiety


In [6]:
# Discard the 'Unnamed: 0' and 'user_id' columns, then show what is left.
unused_columns = ['Unnamed: 0', 'user_id']
df_anon_disorder = df_anon_disorder.drop(columns=unused_columns)
df_anon_disorder.columns

Index(['text', 'disorder'], dtype='object')

In [7]:
# The label column becomes the binary 'priority' flag shared by all datasets.
df_anon_disorder = df_anon_disorder.rename(columns={'disorder': 'priority'})
# Show the original disorder labels before they are collapsed.
print(list(df_anon_disorder['priority'].unique()))
# Every label found in this file maps to priority 1, so assign the constant
# directly. The previous replace() over the list of unique values produced the
# same all-ones column, but via a value-by-value replace on the whole column.
df_anon_disorder['priority'] = 1

['anxiety', 'depression', 'ptsd', 'borderline', 'panic', 'bipolar']


TypeError: 'tuple' object is not callable

In [11]:
# Remove rows with any missing value and report the remaining size.
df_anon_disorder = df_anon_disorder.dropna()
df_anon_disorder.shape

(32191368, 2)

In [12]:
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['anon_disorder_tweets.csv'] = df_anon_disorder

### Anxious_Tweets

In [2]:
# Every tweet in this file is labelled priority 1; the text lives in column '0'.
df_anxious = (
    pd.read_csv(f"{DATA_DIR}Anxious_Tweets.csv")
    .assign(priority=1)
    .rename(columns={'0': 'text'})
    .dropna()
    .drop(columns=['Unnamed: 0'])
)
df_anxious.shape

(8460, 2)

In [3]:
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['Anxious_Tweets.csv'] = df_anxious

### Dataset Students Anxiety and depression

In [4]:
# The spreadsheet's 'label' column is used as the priority flag as-is.
df_dataset = (
    pd.read_excel(f"{DATA_DIR}dataset.xlsx")
    .rename(columns={'label': 'priority'})
    .dropna()
)
df_dataset.shape

(6970, 2)

In [5]:
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['dataset.xlsx'] = df_dataset

### Lonely_Tweets

In [6]:
# Every tweet in this file is labelled priority 1; the text lives in column '0'.
df_lonely = (
    pd.read_csv(f"{DATA_DIR}Lonely_Tweets.csv")
    .assign(priority=1)
    .rename(columns={'0': 'text'})
    .dropna()
    .drop(columns=['Unnamed: 0'])
)
print(df_lonely.shape)
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['Lonely_Tweets.csv'] = df_lonely

(8460, 2)


### mental_health

In [7]:
# The file's 'label' column is used as the priority flag as-is.
df_mental_health = (
    pd.read_csv(f"{DATA_DIR}mental_health.csv")
    .rename(columns={'label': 'priority'})
    .dropna()
)
df_mental_health.shape

(27977, 2)

In [8]:
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['mental_health.csv'] = df_mental_health

### Normal_Tweets

In [9]:
# Every tweet in this file is labelled priority 0; the text column is 'cleaned_text'.
df_normal = (
    pd.read_csv(f"{DATA_DIR}Normal_Tweets.csv")
    .assign(priority=0)
    .rename(columns={'cleaned_text': 'text'})
    .dropna()
    .drop(columns=['Unnamed: 0'])
)
print(df_normal.shape)
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['Normal_Tweets.csv'] = df_normal

(9925, 2)


### p_n_n_dataset

In [24]:
# Build the frame as one chain so no column is ever assigned into a
# column-subset slice of the raw frame (the pattern pandas flags with
# SettingWithCopyWarning). Each step returns a fresh DataFrame.
# Sentiment labels collapse to the binary flag: negative -> 1, everything else -> 0.
df_pn = (
    pd.read_csv(DATA_DIR + "p_n_n_dataset.csv")[['tweet', 'label']]
    .rename(columns={'label': 'priority', 'tweet': 'text'})
    .assign(
        priority=lambda frame: frame['priority'].replace(
            {'negative': 1, 'neutral': 0, 'positive': 0}
        )
    )
    .dropna()
)
df_pn.shape

(174644, 2)

In [25]:
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['p_n_n_dataset.csv'] = df_pn

### Stressed_Tweets

In [12]:
# Every tweet in this file is labelled priority 1; the text column is 'cleaned_text'.
df_stressed = (
    pd.read_csv(f"{DATA_DIR}Stressed_Tweets.csv")
    .assign(priority=1)
    .rename(columns={'cleaned_text': 'text'})
    .dropna()
)
print(df_stressed.shape)
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['Stressed_Tweets.csv'] = df_stressed

(8535, 2)


### Sentiment140 - 1,6M

In [13]:
# The file has no header row; only column 0 (polarity) and column 5 (tweet
# text) are needed, so read just those instead of loading everything and
# slicing afterwards.
cols = ['priority', 'text']
df_sentiment = pd.read_csv(
    DATA_DIR + "training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
)
df_sentiment.columns = cols
df_sentiment = df_sentiment.dropna()
print(df_sentiment.shape)
# Polarity codes: 0 = negative, 2 = neutral, 4 = positive.
# Negative becomes priority 1; neutral and positive become priority 0.
# The order matters: 0 must be turned into 1 before 2/4 are turned into 0.
df_sentiment['priority'] = df_sentiment['priority'].replace([0], 1)
df_sentiment['priority'] = df_sentiment['priority'].replace([2, 4], 0)

(1600000, 2)


In [14]:
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['training.1600000.processed.noemoticon.csv'] = df_sentiment

### Anon_Controls

In [56]:
# Read the raw control-group tweets.
# NOTE(review): the saved output echoes this statement, which is how Python
# renders a warning's source line — presumably a pandas dtype warning; confirm
# and consider passing explicit dtypes.
anon_control_path = f"{DATA_DIR}anon_control_tweets.csv"
df_anon_control = pd.read_csv(anon_control_path)

  df_anon_control = pd.read_csv(DATA_DIR + "anon_control_tweets.csv")


In [57]:
# Discard the 'Unnamed: 0' and 'user_id' columns, then list the labels present.
control_unused_columns = ['Unnamed: 0', 'user_id']
df_anon_control = df_anon_control.drop(columns=control_unused_columns)
print(df_anon_control['disorder'].unique())

['control' nan]


In [58]:
# The label column becomes the binary 'priority' flag shared by all datasets.
df_anon_control = df_anon_control.rename(columns={'disorder': 'priority'})
# Every value found in this column (including missing ones) was mapped to 0 by
# the previous replace() over the list of unique values; assigning the constant
# gives the same all-zeros column without a value-by-value replace.
df_anon_control['priority'] = 0

['control', nan]


In [59]:
# Remove rows with any missing value and report the remaining size.
df_anon_control = df_anon_control.dropna()
df_anon_control.shape

(31930059, 2)

In [61]:
# Register the cleaned frame under its source file name for the final concat.
cleaned_data['anon_control_tweets.csv'] = df_anon_control

## Loading previous data

In [15]:
# Load the previously cleaned records and align the label column name with
# the other datasets.
prev_data = (
    pd.read_csv("../data/text/cleaned/out.csv")
    .rename(columns={'label': 'priority'})
)

In [16]:
# Collapse the original label codes into the binary priority flag. The string
# placeholders keep the two groups apart, because 0 and 1 are also source codes.
# NOTE: not idempotent — applied to labels that are already binary this flips
# them (0 is a "high" code, 1 is a "low" code), so reload out.csv before
# re-running this cell.
high_codes = [0, 3, 4]
low_codes = [1, 2, 5, 6]
prev_priority = prev_data['priority'].replace(high_codes, 'high')
prev_priority = prev_priority.replace(low_codes, 'low')
prev_data['priority'] = prev_priority.replace(['high', 'low'], [1, 0])
cleaned_data['out.csv'] = prev_data

In [17]:
# Check which priority values remain after the recoding.
prev_data['priority'].unique()

array([1, 0], dtype=int64)

## Apilar todos los datasets

In [26]:
# Give every registered dataset the same compact integer dtype for 'priority'
# before stacking; the file name is printed as each one is processed.
for file, df in cleaned_data.items():
    print(file)
    df['priority'] = df['priority'].astype('int8')

Anxious_Tweets.csv
dataset.xlsx
Lonely_Tweets.csv
mental_health.csv
Normal_Tweets.csv
p_n_n_dataset.csv
Stressed_Tweets.csv
training.1600000.processed.noemoticon.csv
out.csv


In [27]:
# Stack all registered datasets row-wise under a fresh 0..n-1 index.
stacked_frames = [*cleaned_data.values()]
final_dataset = pd.concat(stacked_frames, ignore_index=True, axis=0)

In [28]:
# Row and column count of the stacked dataset.
final_dataset.shape

(1914960, 2)

## Export Final Dataset

In [29]:
from pathlib import Path

# Write the stacked dataset to CSV without the index column, creating the
# output folder first if it does not exist yet.
filepath = Path('../data/text/cleaned/reduced_final.csv')
output_dir = filepath.parent
output_dir.mkdir(exist_ok=True, parents=True)
final_dataset.to_csv(filepath, index=False)