In [3]:
import numpy as np
import pandas as pd
import re, sys, os, csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
import string    

In [16]:
def prep(tweet):
    """Normalise a raw tweet into a space-joined string of processed tokens.

    Steps, in order:
      1. Collapse any character repeated three or more times down to two.
      2. Remove spans that start with ``http``, one optional character and
         ``://`` and run up to the next whitespace (plus one trailing
         whitespace character, if present).
      3. Remove runs of digits.
      4. Replace every character in ``string.punctuation`` with a space.
      5. Lower-case the text.
      6. Split on whitespace, Porter-stem each token, then WordNet-lemmatise
         each stem using a part of speech guessed for that stem on its own.

    Args:
        tweet: The raw tweet text.

    Returns:
        The processed tokens joined by single spaces.
    """
    tweet = re.sub(r"(.)\1{2,}", r"\1\1", tweet)
    tweet = re.sub(r'http.?://[^\s]+[\s]?', '', tweet)
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    tweet = re.sub(r'\d+', '', tweet)
    punct = string.punctuation
    trantab = str.maketrans(punct, len(punct) * ' ')  # Every punctuation symbol will be replaced by a space
    tweet = tweet.translate(trantab)
    tweet = tweet.lower()

    def get_wordnet_pos(word):
        """Map the first letter of the POS tag of ``word`` to a WordNet POS constant."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
        # Any tag outside J/N/V/R is treated as a noun.
        return tag_dict.get(tag, wordnet.NOUN)

    ps = PorterStemmer()
    stemmed_words = [ps.stem(word) for word in tweet.split()]
    lemmatizer = WordNetLemmatizer()
    # Note that the lemmatiser is fed the stems, not the original tokens.
    lemma_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in stemmed_words]
    return " ".join(lemma_words)

In [17]:
def word_prob(word):
    """Return the relative frequency of ``word`` in the loaded word list.

    Reads the module-level ``dictionary`` and ``total`` at call time, so
    rebinding either name changes the result. Words absent from
    ``dictionary`` get a probability of 0.0.
    """
    return dictionary[word] / total


def words(text):
    """Return every run of the letters a-z found in the lower-cased ``text``."""
    return re.findall('[a-z]+', text.lower())


# Build the word-frequency table; the `with` block closes the file handle as
# soon as its contents have been read.
with open('dataset/wordlists/merged.txt') as wordlist_file:
    dictionary = Counter(words(wordlist_file.read()))
# Length of the longest known word; bounds the window viterbi_segment scans.
max_word_length = max(map(len, dictionary))
# Total number of word occurrences, as a float so word_prob divides in floats.
total = float(sum(dictionary.values()))

def viterbi_segment(text):
    """Split ``text`` into the word sequence with the highest probability product.

    ``best_prob[i]`` holds the best score for ``text[:i]`` and ``split_at[i]``
    the start index of the last word in that best segmentation. Candidate
    words are scored with ``word_prob`` and limited to ``max_word_length``
    characters.

    Returns:
        A ``(segments, probability)`` tuple: the list of substrings in
        order, and the score of the whole segmentation.
    """
    best_prob = [1.0]
    split_at = [0]
    for end in range(1, len(text) + 1):
        window_start = max(0, end - max_word_length)
        candidates = [
            (best_prob[start] * word_prob(text[start:end]), start)
            for start in range(window_start, end)
        ]
        # Tuples compare by score first, then by start index.
        score, start = max(candidates)
        best_prob.append(score)
        split_at.append(start)

    # Walk the back-pointers from the end of the string to recover the words.
    pieces = []
    pos = len(text)
    while pos > 0:
        pieces.append(text[split_at[pos]:pos])
        pos = split_at[pos]
    return pieces[::-1], best_prob[-1]

def fix_hashtag(text):
    """Expand a matched hashtag into space-separated words.

    Written as a replacement callback for ``re.sub``: ``text`` is the match
    object for the hashtag.

    Args:
        text: A regex match object whose ``group()`` is the hashtag text.

    Returns:
        The segments produced by ``viterbi_segment`` joined by single spaces.
    """
    tag = text.group().split(":")[0]
    tag = tag[1:]  # remove '#'
    # Drop one leading decimal digit, if any. An explicit check replaces the
    # former int()-inside-bare-except probe; slicing keeps an empty tag safe.
    if tag[:1].isdecimal():
        tag = tag[1:]
    return ' '.join(viterbi_segment(tag)[0])

def clean_tweet(tweet):
    """Lower-case a tweet, expand its hashtags and strip mentions, URLs and symbols.

    Steps, in order:
      1. Lower-case the text.
      2. Replace each ``#tag`` with the output of ``fix_hashtag``.
      3. Replace @mentions, ``scheme://...`` URLs and every character that is
         not an ASCII letter, digit, space or tab with a space.
      4. Collapse runs of whitespace into single spaces and trim the ends.

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned tweet as a single-spaced string.
    """
    tweet = tweet.lower()
    tweet = re.sub(r"(#[A-Za-z0-9]+)", fix_hashtag, tweet)
    # Raw string avoids invalid escape sequences. The mention pattern includes
    # '_' so that handles such as @some_user are removed whole instead of
    # leaving the part after the underscore behind.
    stripped = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", tweet)
    return ' '.join(stripped.split())

"""def remove_stopwords(word_list):
        filtered_tweet=""
        for word in word_list:
            word = word.lower() 
            if word not in stopwords.words("english"):
                filtered_tweet=filtered_tweet + " " + word
        
        return filtered_tweet.lstrip()"""
    
def de_repeat(tweet):
    """Shorten any character repeated three or more times in a row to two."""
    return re.sub(r"(.)\1{2,}", r"\1\1", tweet)

def remove_mentions(tweet):
    """Delete every ``@`` followed by one or more word characters."""
    mention = re.compile(r'@\w+')
    return mention.sub('', tweet)

def remove_urls(tweet):
    """Delete ``http``-style links, plus one trailing whitespace character if present."""
    link = re.compile(r'http.?://[^\s]+[\s]?')
    return link.sub('', tweet)

def remove_punctuation(tweet):
    """Replace each character of ``string.punctuation`` in ``tweet`` with a space."""
    # Translation table: code point of every punctuation symbol -> one space.
    blank_out = dict.fromkeys(map(ord, string.punctuation), ' ')
    return tweet.translate(blank_out)

def whitespaces(tweet):
    """Return ``tweet`` without leading or trailing whitespace."""
    return tweet.strip()

def emoticon_punct(tweet):
    """Replace text emoticons in ``tweet`` with underscore-joined descriptions.

    First removes any span matching ``<[^>]*.,!>``. Then every key of
    ``EMOTICONS`` is used as a regular-expression fragment (presumably the
    keys are already regex-escaped by the ``emot`` package; verify), and each
    match is replaced by the key's description with commas turned into
    spaces and the resulting words joined by ``_``.

    Args:
        tweet: The tweet text.

    Returns:
        The tweet with the matched emoticons replaced.
    """
    tweet = re.sub('<[^>]*.,!>', '', tweet)
    # The former re.findall() call here computed a list that was never used.
    for pattern, description in EMOTICONS.items():
        label = "_".join(description.replace(",", " ").split())
        tweet = re.sub(u'(' + pattern + ')', label, tweet)
    return tweet
   
def emoji_toword(tweet):
    """Replace each ``UNICODE_EMO`` key found in ``tweet`` with its description.

    The description has commas and colons turned into spaces, and the
    remaining words are joined with underscores.
    """
    for symbol, description in UNICODE_EMO.items():
        cleaned = description.replace(",", " ").replace(":", " ")
        tweet = tweet.replace(symbol, "_".join(cleaned.split()))
    return tweet

def remove_digits(tweet):
    """Return ``tweet`` with every run of digits removed."""
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\d+', '', tweet)

def to_lower(tweet):
    """Return a lower-cased copy of ``tweet``."""
    lowered = tweet.lower()
    return lowered

def remove_stopwords(tweet):
    """Drop English stopwords and single-character tokens from ``tweet``.

    Args:
        tweet: The text, split on whitespace into tokens.

    Returns:
        The surviving tokens joined by single spaces. A token survives when
        it is longer than one character and is either not in NLTK's English
        stopword list or is one of the whitelisted negations.
    """
    # A set gives constant-time membership tests; the stopword list was
    # previously scanned linearly for every token.
    stopword_set = set(stopwords.words('english'))
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = {"n't", "not", "no"}
    kept = [
        token for token in tweet.split()
        if len(token) > 1 and (token in whitelist or token not in stopword_set)
    ]
    return " ".join(kept)
    
def stemming(tweet):
    """Porter-stem every token of ``tweet``, then WordNet-lemmatise each stem.

    The part of speech passed to the lemmatiser is guessed per stem by
    tagging that stem on its own; tags outside J/N/V/R default to noun.

    Returns:
        The processed tokens joined by single spaces.
    """
    def get_wordnet_pos(word):
        """Translate the first letter of the POS tag of ``word`` to a WordNet constant."""
        initial = nltk.pos_tag([word])[0][1][0].upper()
        pos_by_initial = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
        return pos_by_initial.get(initial, wordnet.NOUN)

    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in tweet.split()]

    lemmatiser = WordNetLemmatizer()
    lemmas = []
    for stem in stems:
        lemmas.append(lemmatiser.lemmatize(stem, get_wordnet_pos(stem)))
    return " ".join(lemmas)
       
def vectorise_label(label):
    """Map a sentiment label string to its integer code.

    Returns ``None`` when ``label`` is not one of the known labels.
    """
    codes = {
        "empty": 1,        # neutral
        "sadness": 2,      # sad
        "enthusiasm": 3,   # happy
        "neutral": 0,      # neutral
        "worry": 4,        # sad
        "surprise": 5,     # happy
        "love": 6,         # love
        "fun": 7,          # happy
        "hate": 8,         # anger
        "happiness": 9,    # happy
        "boredom": 10,     # neutral
        "relief": 11,      # happy
        "anger": 12,       # anger
    }
    return codes.get(label)

In [18]:
# Load the labelled tweets, then show the frame's shape and the first row's
# label and text as a quick sanity check.
data_train = pd.read_csv('dataset/data/text_emotion.csv', sep=',')
print("Dataset shape:", data_train.shape)
print(data_train.loc[0, "sentiment"], ":", data_train.loc[0, "content"])

Dataset shape: (40000, 4)
empty : @tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =[


In [22]:
# Clean every tweet and write one "<cleaned text>,<numeric label>" row each.
# - The file is opened in a `with` block so it is flushed and closed when the
#   loop ends; newline='' leaves line endings to the csv writer.
# - Rows come from the DataFrame itself rather than a hard-coded row count.
# - No module-level `total` is assigned here: word_prob() divides by the
#   global `total` (the word-list occurrence count), so rebinding it would
#   distort the hashtag segmentation performed inside clean_tweet().
with open('cleaned_data/data.csv', 'w', newline='') as cleaned_file:
    dataWriter = csv.writer(cleaned_file, delimiter=',', lineterminator="\n")
    for raw_tweet, sentiment in zip(data_train.content, data_train.sentiment):
        tweet = clean_tweet(raw_tweet)
        dataWriter.writerow([tweet, str(vectorise_label(sentiment))])

print("Progress: ",100,"\nComplete!")

Progress:  100 
Complete!


In [None]:
"""dataWriter = csv.writer(open('emotion_data_cleaned.csv', 'w', encoding = "utf-8"), delimiter=',',lineterminator="\n")

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

total = 55773
for i in range(55773):
    tweet = prep(data_train.iloc[:,0][i]) 
    #tweet = remove_mentions(data_train.iloc[:,-1][i]) 
    #tweet = remove_urls(data_train.iloc[:,-1][i])
    #tweet = remove_digits(data_train.content[i])
    #tweet = remove_punctuation(data_train.content[i])
    #tweet = emoji_toword(data_train.content[i])
    #tweet = emoticon_punct(data_train.content[i])
    #tweet = to_lower(data_train.content[i])
    #tweet = whitespaces(data_train.content[i])
    #tweet = stemming(data_train.content[i])
    #tweet = remove_stopwords(data_train.content[i])
    
    dataWriter.writerow([tweet])
    #sys.stdout.write("\033[F")
    
print("Progress: ",100,"\nComplete!")"""

In [4]:
# Count the rows of the cleaned CSV; `count` stays 0 if the file has no rows.
count = 0
with open('cleaned_data/data.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for count, row in enumerate(readCSV, start=1):
        pass

In [5]:
# Display the number of rows counted in cleaned_data/data.csv.
count

40000