In [7]:
import pandas as pd
from nltk.corpus import stopwords
from wordcloud import STOPWORDS,WordCloud
import numpy as np
import string, os, re
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer


In [8]:
df = pd.read_csv('../data/transcribed_text.csv')

In [9]:
df.head()

Unnamed: 0,game_id,text
0,adunit-ihop-window4-reeses-mpu,You
1,919ef5d6de221daf94537e4d87e98859,
2,6a8e741867d4f893afad015b77b52c39,
3,ec53b0973db5a35d83fd5bb009802bdb,
4,db671d7ebafdc3b7259109fbc18eaac9,I'm so for you. Far and love. With the highly...


In [10]:
df.isnull().sum()

game_id      0
text       173
dtype: int64

In [11]:
df.fillna(' ', inplace=True)

In [12]:
df.head()

Unnamed: 0,game_id,text
0,adunit-ihop-window4-reeses-mpu,You
1,919ef5d6de221daf94537e4d87e98859,
2,6a8e741867d4f893afad015b77b52c39,
3,ec53b0973db5a35d83fd5bb009802bdb,
4,db671d7ebafdc3b7259109fbc18eaac9,I'm so for you. Far and love. With the highly...


In [13]:

def preprocess_first(text):
    # Remove the stop words to prepare the word clouds
    stopWords = set(STOPWORDS)
    # stopWords.update(["RT","https","will","the"])
    
    # Regex patterns
    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    

    # remove stop words
    text = text.apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stopWords))    
    # remove @username 
    text = text.apply(lambda words: ' '.join(re.sub(urlPattern,' ',word) for word in words.split() ))
    # remove all urls 
    text = text.apply(lambda words: ' '.join(re.sub(userPattern,' ', word) for word in words.split() ))
    # Replace 3 or more consecutive letters by 2 letter.
    text = text.str.replace('[^a-zA-Z\s]', ' ')
    #     /^[a-zA-Z\s]*$/g
    text = text.apply(lambda words: ' '.join(re.sub(sequencePattern, seqReplacePattern, word) for word in words.split() ))
    # remove characters and non-english letters
    
    
    return text

In [14]:
text = df['text']

In [15]:
df.text.dtype

dtype('O')

In [16]:
clean_text =  preprocess_first(text)



In [17]:
clean_text

0                                                    you
1                                                       
2                                                       
3                                                       
4      i m you far love with highly awarded alcerdifi...
                             ...                        
508                                                  you
509                                                     
510                                                     
511                                                  you
512                                                     
Name: text, Length: 513, dtype: object

In [18]:

# Functions for data cleaning
wnl = WordNetLemmatizer()


def lemmatize(myWord):
    """Function to lemmatize words"""
    if myWord is None:
        return myWord
    else:
        return str(wnl.lemmatize(myWord))


def prepText(myWord):
    """Final text pre-processing function"""
    return lemmatize(
                myWord.lower()
        )
    
def filterTextList(textList):
    """ lemmatize, and clean all tweets"""
    return [[prepText(word) for word
                in text.split()
                    if prepText(word) is not None]
                for text in textList]

In [19]:
# applying the cleaning function
stopWords = set(STOPWORDS)
clean_textList = filterTextList(text)

In [20]:
#Converting texts to list of words For feature engineering
sentence_list = [text_each for text_each in text]
word_list = [sent.split() for sent in sentence_list]

In [21]:
# showing the list of words
print('\nPlain Sentence: ' + text.values[12] + '\n')
print('Generated List: \n'+ str(word_list[12]))
print('')


Plain Sentence:  Can't take my eyes off of you. Far and love. With the highly awarded, El certified by Lex's collection of pre-em vehicles. Exclusively are your Lex's dealer.

Generated List: 
["Can't", 'take', 'my', 'eyes', 'off', 'of', 'you.', 'Far', 'and', 'love.', 'With', 'the', 'highly', 'awarded,', 'El', 'certified', 'by', "Lex's", 'collection', 'of', 'pre-em', 'vehicles.', 'Exclusively', 'are', 'your', "Lex's", 'dealer.']



In [22]:
def text_category(p):
    v = TextBlob(p).sentiment.polarity
    if v > 0:
        return 'positive'
    elif v < 0:
        return 'negative'
    else:
        return 'neutral'

In [23]:
df['clean_text'] = text

In [24]:
# import nltk
# nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/owon/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [25]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import operator
sia = SentimentIntensityAnalyzer()
df["sentiment_score"] = df["clean_text"].apply(lambda x: sia.polarity_scores(x)["compound"])
df["sentiment"] = np.select([df["sentiment_score"] < 0, df["sentiment_score"] == 0, df["sentiment_score"] > 0],
                           ['negative', 'neutral', 'positive'])

In [27]:
df.sentiment

0       neutral
1       neutral
2       neutral
3       neutral
4      positive
         ...   
508     neutral
509     neutral
510     neutral
511     neutral
512     neutral
Name: sentiment, Length: 513, dtype: object

In [33]:
df['word_count'] = df['clean_text'].str.split().str.len()

In [28]:
df.sentiment.value_counts()

neutral     334
positive    152
negative     27
Name: sentiment, dtype: int64

In [35]:
df[['game_id', 'sentiment_score','sentiment','word_count']].to_csv('../data/sentiment.csv',index=False)