In [1]:
import pandas as pd
import spacy
import re
import numpy as np
from sklearn import preprocessing 

In [2]:
train_df = pd.read_csv('data/big_train_test/train_data.csv')

In [3]:
train_df = train_df.dropna()

In [4]:
train_df = train_df[['content', 'sentiment']]

In [5]:
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'labels'. 
train_df['sentiment_labels']= label_encoder.fit_transform(train_df['sentiment']) 

In [6]:
train_df['sentiment_labels'].unique() 

array([ 2, 10,  3,  8, 12, 11,  7,  4,  6,  5,  1,  9,  0])

In [7]:
label_encoder.inverse_transform(train_df['sentiment_labels'].unique() )

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

train_df['sentiment'] = train_df['sentiment'].astype('category')
train_df['sentiment'] = train_df['sentiment'].cat.codes
dict( zip( train_df['sentiment'].cat.codes, train_df['sentiment'] ) )


In [8]:
train_df.groupby(['sentiment']).size()

sentiment
anger           94
boredom        154
empty          588
enthusiasm     486
fun           1008
happiness     2742
hate          1136
love          1893
neutral       5566
relief         967
sadness       4575
surprise      1495
worry         7027
dtype: int64

In [9]:
train_df = train_df[train_df.sentiment_labels != 0]
train_df = train_df[train_df.sentiment_labels != 1]
train_df = train_df[train_df.sentiment_labels != 2]
train_df = train_df[train_df.sentiment_labels != 3]

In [10]:
for i in range(-1,13):
    len_s = len(train_df.loc[train_df['sentiment_labels'] == i])
    if len_s > 1500:
        diff = len_s-1500
        to_remove = np.random.choice(train_df[train_df['sentiment_labels']==i].index,size=diff,replace=False)
        train_df = train_df.drop(to_remove)

In [11]:
train_df.groupby(['sentiment']).size()

sentiment
fun          1008
happiness    1500
hate         1136
love         1500
neutral      1500
relief        967
sadness      1500
surprise     1495
worry        1500
dtype: int64

## HELPING FUNCTIONS

In [18]:
import json
import re
from itertools import islice
from nltk.corpus import wordnet
wordnet.synsets("everyone")
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
json_path = 'data/text_alternatives.json'
with open(json_path, 'r') as file:
    json_data = json.load(file)
abb_json = json_data['abb_json']
smiley = json_data['smiley']
smiley_list = smiley.keys()

from itertools import groupby 
from string import punctuation
punc = set(punctuation)

subst_string = {'\'ve': ' have', '`ve': ' have', '\'s': '', '`s': '', 'n\'t': ' not', 
                'n`t': ' not', '\'ll': ' will', '`ll': ' will', '\'d': ' had', 
                '`d': ' hd', '\'re': ' are', '`re': ' are'}


def remove_cons_punct(text):
    newtext = []
    for k, g in groupby(text):
        if k in punc:
            newtext.append(k)
        else:
            newtext.extend(g)

    return ''.join(newtext)


def chunks(l, n):
    # For item i in a range that is a length of l,
    for i in range(0, len(l), n):
        # Create an index range for l of n items:
        yield l[i:i+n]


smiley_list = list(chunks(list(smiley_list), 10))
abb_list = abb_json.keys()
abb_list = [nlp(text) for text in abb_list]
matcher.add('abb_list', None, *abb_list)
try:
    for each_list in smiley_list:
        each_list = [nlp(text) for text in each_list]
        matcher.add('smiley_list', None, *each_list)
except:
    print("exception occured while adding patterns")
    
    
def remove_hashtag_mentions(text):
    text = re.sub(r'((?<=^)([(@|#)][^  ]+)|(?<= )([(@|#)][^ ]+))', '', text)
    return text


def remove_html_texts(text):
    text = re.sub(r'(&.+?;)', '', text)
    return text


def remove_url(text):
    text = re.sub(r'https?://.*[\r\n]*', '', text)
    return text

def process_tweets_for_abb_smiley(text):
    text = re.sub(r'(?=([a-z])\1\1).', '', text)
    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        if doc[start:end].text in abb_json.keys():
            text = text.replace(doc[start:end].text, abb_json[doc[start:end].text])
        if doc[start:end].text in smiley.keys():
            text = text.replace(doc[start:end].text, smiley[doc[start:end].text])
    return remove_cons_punct(text.lower())


def process_abbword_smiley_links_hashtags_mentions(text, remove_links_urls=True, remove_hashtag_and_mentions=True):
    text = process_tweets_for_abb_smiley(text)
    text = remove_html_texts(text)
    if remove_links_urls:
        text = remove_url(text)
    if remove_hashtag_and_mentions:
        text = remove_hashtag_mentions(text)
    return text

def get_string_removing_consequtive_similar_char(text):
    new_text_list = text.split(" ")
    new_string = ""
    for new_text in new_text_list:
        meaningful = []
        if len(wordnet.synsets(new_text)) ==0:
            consq_char = re.findall(r'([a-z])\1', new_text)
            consq_char_comb = get_char_combination(consq_char)
            for comb in consq_char_comb:
                pattern = r'(?=([' + re.escape(comb) + r'])\1).'
                new_word = re.sub(pattern, "", new_text)
                if wordnet.synsets(new_word):
                    meaningful.append(new_word)
            if meaningful:
                new_string += max(meaningful, key=len) + " "
                
            else: new_string += new_text + " "
        else:
            new_string += new_text + " "
    return new_string

def get_char_combination(consq_char):
    n = len(list(consq_char))
    arr = [None] * n
    binary_arr = generateAllBinaryStrings(n, arr, 0, [])
    result = []
    for elem in binary_arr:
        c = ""
        c = "".join([c + elem[i]*consq_char[i] for i in range(0,n)])
        if c != "":
            result.append(c)
        else: continue
        
    return result

def generateAllBinaryStrings(n, arr, i, m_arr):  
  
    if i == n: 
        m_arr.append(arr[:])  
        return m_arr
      
    # First assign "0" at ith position  
    # and try for all other permutations  
    # for remaining positions  
    arr[i] = 0
    m_arr = generateAllBinaryStrings(n, arr, i + 1, m_arr)  
  
    # And then assign "1" at ith position  
    # and try for all other permutations  
    # for remaining positions  
    arr[i] = 1
    m_arr = generateAllBinaryStrings(n, arr, i + 1, m_arr)  
    return m_arr


def get_lemma_from_string(text, remove_stopwords=True, remove_punctuation=False):
    lemmatised_input_list_string = nlp(u'' + text)
    lemmatised_list = list()
    for token in lemmatised_input_list_string:
        if not token.lemma_ == '-PRON-':
            if remove_punctuation and remove_stopwords:
                if not token.is_stop and (token.text == '.' or not token.is_punct):
                    lemmatised_list.append(token.lemma_)
            elif remove_punctuation:
                if token.text == '.' or not token.is_punct:
                    lemmatised_list.append(token.lemma_)
            elif remove_stopwords:
                if not token.is_stop:
                    lemmatised_list.append(token.lemma_)
    lemmatised_string = " ".join(lemmatised_list)
    return lemmatised_string.strip()

def replace_string(df):
    for text in subst_string.keys():
        df['content_lemma'] = df['content_lemma'].str.replace(text, subst_string[text])
    return df

## Preprocess Content

In [17]:
train_df['content'] = train_df.apply (lambda row: process_abbword_smiley_links_hashtags_mentions(row['content']), axis=1)

In [20]:
train_df['content'] = train_df.apply (lambda row: get_string_removing_consequtive_similar_char(row['content']), axis=1)

In [21]:
train_df['content_lemma'] = train_df.apply (lambda row: get_lemma_from_string(row['content']), axis=1)

In [22]:
train_df = replace_string(train_df)

In [27]:
train_df['content_len'] = train_df['content'].str.len()  # Store string length of each sample
train_df['content_lemma_len'] = train_df['content_lemma'].str.len()  # Store string length of each sample


In [28]:
train_df

Unnamed: 0,content,sentiment,sentiment_labels,content_lemma,content_len,content_lemma_len
8,charlene my love. i miss you,sadness,10,charlene love . miss,29,20
14,got the news,surprise,11,get news,13,8
15,the storm is here and the electricity is gone,sadness,10,storm electricity go,46,20
16,agreed,love,7,agree,7,5
17,so sleepy again and it's not even that late. i...,sadness,10,sleepy late . fail .,64,20
...,...,...,...,...,...,...
28934,hanging with my cousin jimmy then hopefully ha...,fun,4,hang cousin jimmy hopefully hang friend,67,39
28936,with alex,sadness,10,alex,10,4
28937,that is comedy good luck my friend!,happiness,5,comedy good luck friend !,37,27
28938,stephs grad party gr8! shoved cake in her face...,fun,4,stephs grad party gr8 ! shoved cake face wat...,102,96


## --------------------------------------------IF YOU NEED TO SPLIT DF-----------------------------------------

from fsplit.filesplit import FileSplit
fs = FileSplit(file='new_data/train_df.csv', splitsize=50000, output_dir='new_data/splits/')
fs.split(include_header=True)
from os import walk

for (dirpath, dirnames, filenames) in walk('new_data/splits/'):
    for file in filenames:
        df = pd.read_csv('new_data/splits/'+file)
        df['content'] = df.apply (lambda row: get_string_removing_consequtive_similar_char(row['content']), axis=1)
        print('done', file)
        df.to_csv('new_data/cleaned/'+file)

## ----------------------------------------------------------------------------------------------------------------------------

In [29]:
train_df.to_csv('data/new_data/train.csv')