In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
import re 
import copy
from sklearn import svm

In [None]:
# Read the two source CSV files, one per class, into separate DataFrames.
depression, non_depression = (
    pd.read_csv(csv_name)
    for csv_name in ('depression_tweets.csv', 'non_depression_tweets.csv')
)

In [None]:
# Label each source frame with its class before combining them.
depression['depression'] = True
non_depression['depression'] = False

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True gives the combined frame a fresh
# 0..n-1 index instead of repeating the row labels of both inputs.
df_tweets_depression = pd.concat(
    [depression, non_depression],
    ignore_index=True,
)

In [None]:
# Show the first five rows of the combined frame.
df_tweets_depression.head(n=5)

In [None]:
# Preview five random rows without the language and user columns.
# The previous `df[~df.isnull()]` indexed with a boolean DataFrame, which only
# masks individual cells (null cells stay null) and never filters rows, so it
# was a no-op that copied the whole frame; it is dropped here.
df_tweets_depression.drop(columns=['language', 'user_screen_name']).sample(5)

## Remoção de mentions e RTs

In [None]:
# Flag retweets and mentions, then strip both markers from the tweet text.
regex_rt = r'RT @([\w]+):'
regex_mention = r'@([\w]+)'

# The flags are computed on the text as it is before any replacement below;
# na=False maps rows with missing text to False.
is_rt = df_tweets_depression.text.str.contains(regex_rt, regex=True, na=False)
# A mention is an @handle in a tweet that is not itself a retweet.
is_mention = df_tweets_depression.text.str.contains(regex_mention, regex=True, na=False) & ~is_rt

# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text untouched.
rts = df_tweets_depression.text.str.replace(regex_rt, '', regex=True)
mentions = rts.str.replace(regex_mention, '', regex=True)

df_tweets_depression['text'] = mentions
df_tweets_depression['is_rt'] = is_rt
df_tweets_depression['is_mention'] = is_mention

In [None]:
# Number of rows flagged as mentions.
df_tweets_depression.loc[df_tweets_depression['is_mention']].shape[0]

In [None]:
# Number of rows flagged as retweets.
df_tweets_depression.loc[df_tweets_depression['is_rt']].shape[0]

In [None]:
# Show the first five rows after the RT/mention clean-up.
df_tweets_depression.head(n=5)

## Remoção de links e \n

In [None]:
# Flag tweets that contain a t.co link, then remove links and newlines.
# The dot is escaped so it only matches a literal '.', and the pattern is no
# longer anchored with \Z, so links are removed wherever they appear in the
# tweet rather than only when they are the last thing in it.
regex_link = r'https://t\.co/\w+'
regex_n = r'\n'

# Detect links on the text as it is before they are removed.
has_link = df_tweets_depression.text.str.contains(regex_link, regex=True, na=False)

# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal by default, so r'\n' would be searched as a backslash
# followed by 'n' and real newlines would be kept.
links = df_tweets_depression.text.str.replace(regex_link, '', regex=True)

df_tweets_depression['has_link'] = has_link
df_tweets_depression['text'] = links.str.replace(regex_n, ' ', regex=True)

In [None]:
# Number of rows flagged as containing a link.
df_tweets_depression.loc[df_tweets_depression['has_link']].shape[0]

## Remoção de tweets em outras línguas
### O ideal seria traduzir os em inglês

In [None]:
# Total number of rows at this point.
df_tweets_depression.shape[0]

In [None]:
# Distinct values present in the language column.
df_tweets_depression['language'].unique()

In [None]:
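# NOTE: language filtering is currently disabled, so no rows are removed here.
# Uncomment to keep only tweets whose language is 'pt'.
# df_tweets_depression = df_tweets_depression[(df_tweets_depression.language == 'pt')]
# len(df_tweets_depression)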

## Letra minúscula

In [None]:
# Display the dtype of every column before the text is transformed.
df_tweets_depression.dtypes

In [None]:
# Lower-case the text and collapse every run of whitespace to a single space.
# Missing text becomes an empty string rather than going through str(), which
# turned it into the literal token 'nan' and let those rows slip past both the
# empty-text filter and the null filter further down.
df_tweets_depression['text'] = (
    df_tweets_depression['text']
    .fillna('')
    .astype(str)   # non-string values are stringified, as str(x) did before
    .str.lower()
    .str.split()   # no argument: split on any whitespace run
    .str.join(' ')
)

## Remover pontuações e caracteres especiais

In [None]:
# Flag tweets that contain punctuation/special characters, then remove them.
# The character class keeps every accented letter used in Portuguese
# (à â ê í ô ú ü were missing, so words such as "você" lost letters).
# Whitespace is excluded from the match so that the spaces separating tokens
# neither count as "special" for the flag nor get removed and glue words together.
regex_special = r'[^a-zA-Záàâãéêíóôõúüç\s]+'

has_special_ch = df_tweets_depression.text.str.contains(regex_special, regex=True, na=False)
df_tweets_depression['has_special_ch'] = has_special_ch

# Remove the special characters, then re-split and re-join so that tokens made
# only of special characters disappear without leaving stray spaces behind.
df_tweets_depression['text'] = (
    df_tweets_depression['text']
    .str.replace(regex_special, '', regex=True)
    .str.split()
    .str.join(' ')
)

## Remover stopwords

In [None]:
# Build the stop-word list: NLTK's Portuguese list plus a few extra tokens.
stop_words = stopwords.words('portuguese')
stop_words.extend(('pra', 'pras', 'pro', 'pros', 'q', '-', 'é', 'tá', 'ta', 'vai', 'to'))
# Membership is tested once per token of every tweet, so look tokens up in a
# set instead of scanning the list each time. `stop_words` itself stays a list.
stop_words_lookup = set(stop_words)

# Keep the text as it was before stop-word removal for later comparison.
df_tweets_depression['original_text'] = df_tweets_depression['text']
df_tweets_depression['text'] = df_tweets_depression['text'].apply(
    lambda x: ' '.join(
        item for item in x.split() if item.lower() not in stop_words_lookup
    )
)

In [None]:
# Flag tweets from which at least one stop word was removed.
# `text` was rebuilt with split()/join(' '), so `original_text` is normalised
# the same way before comparing; otherwise a tweet that merely had repeated or
# leading/trailing spaces would be flagged even though no word was dropped.
df_tweets_depression['has_stopwords'] = (
    df_tweets_depression['original_text'].str.split().str.join(' ')
    != df_tweets_depression['text']
)

In [None]:
# Number of rows flagged as having had stop words.
df_tweets_depression.loc[df_tweets_depression['has_stopwords']].shape[0]

## Remover palavras muito frequentes

In [None]:
# Disabled: would drop every token that occurs more than 8 times in the corpus.
# freq = pd.Series(' '.join(df_tweets_depression['text']).split()).value_counts()
# # less_freq = list(freq[freq < 3].index)
# more_freq = list(freq[freq > 8].index)
# df_tweets_depression['text'] = df_tweets_depression['text'].apply(lambda x: ' '.join(x for x in x.split() if x not in more_freq))

## Remover tweets sem texto

In [None]:
# Number of rows whose text is an empty string.
df_tweets_depression.loc[df_tweets_depression['text'].eq('')].shape[0]

In [None]:
# df_tweets = df_tweets.dropna()
# Keep only the rows whose text is not an empty string.
df_tweets_depression = df_tweets_depression.loc[df_tweets_depression['text'].ne('')]

In [None]:
# Show the first five rows after dropping empty tweets.
df_tweets_depression.head(n=5)

In [None]:
# Show the first five rows labelled as non-depression.
df_tweets_depression.loc[~df_tweets_depression['depression']].head(n=5)

## Remover nulos

In [None]:
# Row count before rows with nulls are dropped.
df_tweets_depression.shape[0]

In [None]:
# Row count that would remain after dropping rows with a null in any of the
# three listed columns (the frame itself is not modified here).
(
    df_tweets_depression
    .dropna(subset=['text', 'user_screen_name', 'depression'])
    .shape[0]
)

In [None]:
# Drop every row that has a null in text, user_screen_name or depression.
df_tweets_depression = df_tweets_depression.dropna(
    axis='index',
    how='any',
    subset=['text', 'user_screen_name', 'depression'],
)

In [None]:
# Show the first five rows after dropping rows with nulls.
df_tweets_depression.head(n=5)

In [None]:
# Final row count after all cleaning steps.
df_tweets_depression.shape[0]

In [None]:
# df_tweets_depression.to_csv('clean_depression_tweets.csv', index=False)