In [41]:
import pandas as pd
import datetime

from pattern.nl import sentiment
import numpy as np

from plotnine import ggplot, aes, geom_vline, geom_histogram

import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
def read_file(csv_file, min_length=10):
    """
    Load a semicolon-delimited CSV and drop rows whose message is unusable.

    params:
    csv_file: file with 'message' as column name for text
    min_length: minimum number of characters a message must have to be kept
                (default 10)

    returns:
    dataframe in which missing values are replaced by the string 'None',
    rows with a missing or too-short message are removed, and duplicate
    messages are dropped (the first occurrence is kept)
    """
    df = pd.read_csv(csv_file, delimiter=';').fillna('None')

    # Compare on the string form so a message column that pandas parsed as
    # numbers does not break the length check.
    message = df['message'].astype(str)

    # A row is kept only when BOTH conditions hold. Combining them with `|`
    # would let every non-missing message through, however short.
    keep = (message != 'None') & (message.str.len() >= min_length)
    df = df[keep]

    # drop duplicates
    df = df.drop_duplicates(subset='message')

    return df

In [43]:
def df_preprocessing(df, platform):
    """
    Apply platform-specific column renaming and date formatting.

    params:
    df: dataframe
    platform: type of platform (facebook, twitter, etc.)

    returns:
    for 'facebook', a renamed copy of df whose 'query_time' and
    'created_time' columns hold 'YYYY-MM-DD' strings (the placeholder
    'None' is left as is); for any other platform, df itself, unchanged
    """
    # Only Facebook exports are handled; everything else passes straight through
    if platform != 'facebook':
        return df

    def to_day(raw, layout):
        # The 'None' placeholder marks a missing timestamp and is kept verbatim
        if raw == 'None':
            return 'None'
        parsed = datetime.datetime.strptime(raw, layout)
        return parsed.date().strftime('%Y-%m-%d')

    # Shorter names for the reaction counters
    reaction_names = {
        'like.summary.total_count': 'like_count',
        'love.summary.total_count': 'love_count',
        'haha.summary.total_count': 'haha_count',
        'wow.summary.total_count': 'wow_count',
        'sad.summary.total_count': 'sad_count',
        'angry.summary.total_count': 'angry_count',
    }
    renamed = df.rename(columns=reaction_names)

    # Each timestamp column arrives in its own layout; both are cut down to the date
    renamed['query_time'] = renamed['query_time'].apply(
        lambda value: to_day(value, '%Y-%m-%d %H:%M:%S.%f'))
    renamed['created_time'] = renamed['created_time'].apply(
        lambda value: to_day(value, '%Y-%m-%dT%H:%M:%S%z'))

    return renamed

In [44]:
def calculate_sentiment(df):
    """
    Add 'sentiment' and 'subjectivity' columns computed from each message.

    params:
    df: dataframe with a 'message' column; it is modified in place

    returns:
    the same dataframe, with the two score columns added
    """
    # Each word in the lexicon has scores for:
    # 1)     polarity: negative vs. positive    (-1.0 => +1.0)
    # 2) subjectivity: objective vs. subjective (+0.0 => +1.0)
    # 3)    intensity: modifies next word?      (x0.5 => x2.0)

    # Score every message exactly once and split the pair afterwards, instead
    # of running the analyser a second time just to read the other element.
    scores = [sentiment(message) for message in df['message']]

    # Add columns sentiment and subjectivity. Building the Series on df's own
    # index keeps row alignment and also works when df has no rows.
    df['sentiment'] = pd.Series([score[0] for score in scores], index=df.index, dtype=float)
    df['subjectivity'] = pd.Series([score[1] for score in scores], index=df.index, dtype=float)

    return df

In [45]:
def sentiment_stats(df):
    """
    Print the mean sentiment and subjectivity, then a histogram of each.

    params:
    df: dataframe with 'sentiment' and 'subjectivity' columns
    """
    def histogram_with_mean(column, mean_value):
        # 30-bin grey histogram of one column, with a dashed line at its mean
        return ggplot(df, aes(x=column)) + geom_histogram(
            bins=30, color='black', fill='gray') + geom_vline(
            aes(xintercept=mean_value),
            linetype='dashed', size=0.6)

    # Means of sentiment and subjectivity
    averages = {name: df[name].mean() for name in ('sentiment', 'subjectivity')}

    print("Sentiment mean:", averages['sentiment'])
    print("Subjectivity mean", averages['subjectivity'])

    # One plot per score, sentiment first
    for name in ('sentiment', 'subjectivity'):
        print(histogram_with_mean(name, averages[name]))

In [46]:
def preprocess_text(text):
    """
    Normalise a message for bag-of-words processing.

    params:
    text: string to clean

    returns:
    the lower-cased text with URLs, punctuation/symbols and ASCII digits
    removed; whitespace is left untouched, so removed items can leave
    consecutive spaces behind
    """
    text = text.lower()
    # Remove urls wherever they occur. An anchored pattern ('^https?://.*')
    # would only match at the start of a line and would then swallow the rest
    # of that line; URLs in mid-sentence would survive and be mangled below.
    text = re.sub(r'(?:https?://|www\.)\S+', '', text)
    # Remove punctuation and symbols (anything that is neither a word
    # character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'[0-9]', '', text)
    return text

In [None]:
# Scratch note (R, not Python): the pre-processing arguments passed to the
# dfm() call elsewhere in this notebook. This is a bare argument list, not a
# complete statement.
tolower = TRUE, 
remove_numbers = TRUE, 
remove_punct = TRUE, 
remove_url = TRUE, 
remove_symbols = TRUE,
remove = c(stop_vec,emoji_vec, stopwords('dutch'),stopwords('english')),
stem = TRUE

In [None]:
# Dutch and English stop word lists as shipped with NLTK
stopwords_nl = nltk.corpus.stopwords.words('dutch')
stopwords_en = nltk.corpus.stopwords.words('english')
# More extensive list of Dutch stopwords (R code follows):
# R: read the custom stop word table (first row is the header) and turn its
# Custom_stopwords column into a plain vector
mystopwords <- read.table("stop_words_dutch.txt", header = TRUE)
class(mystopwords)
stop_vec = as.vector(mystopwords$Custom_stopwords)
class(stop_vec)
#Also drop basic English stopwords
# (this call only evaluates the list; its result is not assigned here)
stopwords('english')
#List of emojis & symbols
# Same pattern as above: the Emojis column of the table becomes a vector
emoji <- read.table("stop_words_emojis.txt", header=TRUE)
emoji_vec = as.vector(emoji$Emojis)

In [49]:
def create_dfm(df):
    """
    Build a document-term count matrix from the messages in df.

    Dutch stop words from NLTK are excluded from the vocabulary. The matrix
    is printed and also returned so that callers can work with it.

    params:
    df: dataframe with a 'message' column

    returns:
    dataframe with one row per message and one column per term, holding
    the term counts
    """
    # Pass the stop words as a list, the documented type for `stop_words`.
    dutch_stopwords = list(nltk.corpus.stopwords.words('dutch'))

    cv = CountVectorizer(stop_words=dutch_stopwords)
    X = cv.fit_transform(df['message'].values)

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); fall back for older installations.
    if hasattr(cv, 'get_feature_names_out'):
        terms = cv.get_feature_names_out()
    else:
        terms = cv.get_feature_names()

    result = pd.DataFrame(data=X.toarray(), columns=terms)
    print(result)
    return result

In [None]:
# Toy example: term counts for three short sentences over a fixed vocabulary
corpus = ['He is a good person',
          'He is bad student',
          'He is hardworking']
df = pd.DataFrame(data=corpus, columns=['sentences'])

# `min_df` and `stop_words` are left at their defaults: min_df is ignored when
# a vocabulary is supplied, and the default stop_words=None removes nothing,
# which is what an empty stop word set did. The token pattern also keeps
# one-letter words such as 'a'.
vectorizer = CountVectorizer(vocabulary=['he', 'is', 'a', 'good', 'person', 'bad', 'student', 'hardworking'],
                             token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(df['sentences'].values)

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

result = pd.DataFrame(data=X.toarray(), columns=feature_names)
print(result)

In [None]:
## Create a text corpus
# R code. Builds a corpus from `fb`, taking the document text from its
# "message" column (presumably quanteda's corpus() — confirm).
quantedacorpus <- corpus(fb,
                         text_field = "message",
                         meta = list("id", "created_time", "comment_count","Like_count" ),
                         unique_docnames = TRUE)
#Prepare data
#Stopwords
# (the bare stopwords() calls below only evaluate the lists; nothing is assigned)
stopwords('dutch')
#more extensive list of dutch stopwords:
mystopwords <- read.table("stop_words_dutch.txt", header = TRUE)
class(mystopwords)
stop_vec = as.vector(mystopwords$Custom_stopwords)
class(stop_vec)
#Also drop basic English stopwords
stopwords('english')
#List of emojis & symbols
emoji <- read.table("stop_words_emojis.txt", header=TRUE)
emoji_vec = as.vector(emoji$Emojis)


# Create document-frequency-matrix for topic model
# Removed terms: custom stop words, emojis/symbols, and the Dutch and English
# stop word lists
dfm_fb <- dfm(quantedacorpus, 
                  tolower = TRUE, 
                  remove_numbers = TRUE, 
                  remove_punct = TRUE, 
                  remove_url = TRUE, 
                  remove_symbols = TRUE,
                  remove = c(stop_vec,emoji_vec, stopwords('dutch'),stopwords('english')),
                  stem = TRUE)

######Explore
#Get most frequent terms
# Top 200 features of dfm_fb, in decreasing order
topfeatures(
  dfm_fb,
  n = 200,
  decreasing = TRUE,
  scheme = c("count", "docfreq"),
  groups = NULL
)

In [52]:
def main(csv_file='FB_NOS_NU_Telegraaf_NRC_all_endFeb.csv', platform='facebook'):
    """
    Load the export and apply the platform-specific preprocessing.

    params:
    csv_file: path of the CSV file to read (defaults to the Facebook export
              this notebook was written for)
    platform: type of platform (facebook, twitter, etc.)

    returns:
    the preprocessed dataframe
    """
    df = read_file(csv_file)
    df = df_preprocessing(df, platform)

    # Later pipeline stages, currently switched off:
    # df = calculate_sentiment(df)
    # sentiment_stats(df)
    # create_dfm(df)

    return df


if __name__ == '__main__':

    main()

In [None]:

#*************************************************************************
## Split up the corpus in positive and negative comments
#
# --> maybe rather one corpus with most positive and negative comments?!
#
#fb_s.POSITIVE <- subset(fb_s, Sentiment >= 0.20)
#fb_s.NEGATIVE <- subset(fb_s, Sentiment <= -0.20)

#fb_s.POSITIVE$message <- as.character(fb_s.POSITIVE$message)
#fb_s.NEGATIVE$message <- as.character(fb_s.NEGATIVE$message)
##
#*********************************************************************************************************************
#              High Subjectivity Comments
#*********************************************************************************************************************
# Get long comments with high sentiment (+/-) AND high subjectivity score
# R code; `fb_s` is not defined in this cell.

# Keep rows whose Sentiment is at least 0.25 or at most -0.25
fb_subjective <- fb_s[(fb_s$Sentiment >= 0.25) | (fb_s$Sentiment <= -0.25), ]

# ... and of those, only rows with Subjectivity of at least 0.4
fb_subjective <- subset(fb_subjective, Subjectivity >= 0.4)

#Drop rows with very short comments
# (messages of fewer than 250 characters are removed)
fb_subjective$message <- as.character(fb_subjective$message)
fb_subjective = fb_subjective[(which(nchar(fb_subjective$message) >= 250)),]

#Drop less important columns
# NOTE(review): X.U.FEFF.level looks like a byte-order mark fused into the
# first column header on import — confirm
fb_subjective = subset(fb_subjective, select = -c(X.U.FEFF.level, object_type,query_status,query_time, 
                                                  query_type) )

#Sort by Subjectivity decreasing before start reading comments
fb_subjective <-fb_subjective[order(fb_subjective$Subjectivity, decreasing = TRUE),]
head(fb_subjective)

#Subset for Teun for intercoder reliability 
# Rows 121 to 250 of the sorted frame
fb_subset <-fb_subjective[121:250,]

# NOTE(review): writexl is installed and loaded here, but nothing in this cell
# writes a file yet
install.packages("writexl")
library("writexl")

In [None]:
## Create a text corpus
# R code. Builds a corpus from `fb`, taking the document text from its
# "message" column (presumably quanteda's corpus() — confirm).
quantedacorpus <- corpus(fb,
                         text_field = "message",
                         meta = list("id", "created_time", "comment_count","Like_count" ),
                         unique_docnames = TRUE)
#Prepare data
#Stopwords
# (the bare stopwords() calls below only evaluate the lists; nothing is assigned)
stopwords('dutch')
#more extensive list of dutch stopwords:
mystopwords <- read.table("stop_words_dutch.txt", header = TRUE)
class(mystopwords)
stop_vec = as.vector(mystopwords$Custom_stopwords)
class(stop_vec)
#Also drop basic English stopwords
stopwords('english')
#List of emojis & symbols
emoji <- read.table("stop_words_emojis.txt", header=TRUE)
emoji_vec = as.vector(emoji$Emojis)


# Create document-frequency-matrix for topic model
# Removed terms: custom stop words, emojis/symbols, and the Dutch and English
# stop word lists
dfm_fb <- dfm(quantedacorpus, 
                  tolower = TRUE, 
                  remove_numbers = TRUE, 
                  remove_punct = TRUE, 
                  remove_url = TRUE, 
                  remove_symbols = TRUE,
                  remove = c(stop_vec,emoji_vec, stopwords('dutch'),stopwords('english')),
                  stem = TRUE)

######Explore
#Get most frequent terms
# Top 200 features of dfm_fb, in decreasing order
topfeatures(
  dfm_fb,
  n = 200,
  decreasing = TRUE,
  scheme = c("count", "docfreq"),
  groups = NULL
)

#Look at keywords in context
# Each kwic() call searches the corpus for a glob pattern and shows `window`
# words of context on either side; head() limits the rows shown.
# NOTE(review): the search terms (allergy, blood thinner, pregnancy, wish for
# children) and the "patient reports" wording further down look carried over
# from a different analysis — confirm they are meant for this corpus.
head(kwic(quantedacorpus, pattern = "go*", window = 3, valuetype = "glob"))
head(kwic(quantedacorpus, pattern = "allergi*", window = 6, valuetype = "glob"), n=15)
allergy <- data.frame(head(kwic(quantedacorpus, pattern = "allergi*", window = 7, valuetype = "glob"), n=50))
allergy

head(kwic(quantedacorpus, pattern = "bloedverdunner*", window = 6, valuetype = "glob"), n=15)
head(kwic(quantedacorpus, pattern = "zwanger*", window = 6, valuetype = "glob"), n=15)
head(kwic(quantedacorpus, pattern = "kinderwens", window = 6, valuetype = "glob"), n=15)


#Read sample patient reports
# Prints the full text of three documents picked by position (30, 152, 1540)
options(max.print=1000)
print(as.character(quantedacorpus[30]))
print(as.character(quantedacorpus[152]))
print(as.character(quantedacorpus[1540]))

#most frequent bigrams
# (disabled: tidytext-based bigram count over fb$message)
#install.packages("tidytext")
#library(tidytext)
#library(dplyr)
#?unnest_tokens()
#bifb <- fb %>% unnest_tokens(ngram,message,token = "ngrams", n = 2)
#bifb %>% count(ngram, sort = TRUE)

#***********************************************************************************************************
## Further NLP: POS tagging and most frequent NOUN / ADJ
# First clean quanteda corpus (similar to pre-processing steps as above for dfm)
# Tokenise the corpus
qcorpus <- tokens(quantedacorpus)

# Remove custom stop words, emojis/symbols and the Dutch and English stop words
qcorpus <- tokens_select(qcorpus, c(stop_vec,emoji_vec, stopwords('dutch'),stopwords('english')),selection='remove')

# Second tokens() pass on the already tokenised object, this time stripping
# numbers, punctuation, URLs and symbols
qcorpus <- tokens(qcorpus, remove_numbers = TRUE,  remove_punct = TRUE,remove_url = TRUE, 
                  remove_symbols = TRUE,remove_separators = FALSE)
      
######## POS tagging and keyword analysis with udpipe #####################


# NOTE(review): installing the package and downloading the model happen on
# every run of this cell
install.packages("udpipe")
library(udpipe)
dl <- udpipe_download_model(language = "dutch")
str(dl)

# The model file name is hard-coded rather than taken from `dl` —
# NOTE(review): confirm it matches the file that was downloaded
udmodel_dutch <- udpipe_load_model(file = "dutch-alpino-ud-2.5-191206.udpipe")

# Collapse each document's tokens into one space-separated string. Spaces
# inside a token are first swapped for U+00A0 so that the only plain spaces
# left are the ones between tokens.
txt <- sapply(qcorpus, FUN=function(x){
  x <- gsub(" ", intToUtf8(160), x) ## replace space with no-break-space
  paste(x, collapse = " ")
})
# Annotate the pre-tokenised text; `x` is rebound from annotation object to
# data frame
x <- udpipe_annotate(udmodel_dutch, x = as.character(txt), tokenizer = "horizontal",parser = "none")
x <- as.data.frame(x)
str(x)
# Number of tokens per universal part-of-speech tag
table(x$upos)

# The same three steps are repeated for nouns, verbs and adjectives: keep the
# rows of `x` with the given upos tag, count token frequencies with
# txt_freq(), and chart the first 20 rows. `stats` is overwritten each time.
library(lattice)
stats <- subset(x, upos %in% c("NOUN")) 
stats <- txt_freq(stats$token)
# Factor levels are set in reverse row order (presumably so the most frequent
# term is drawn at the top of the chart — confirm)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", 
         main = "Most occurring nouns", xlab = "Freq")

stats <- subset(x, upos %in% c("VERB")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", 
         main = "Most occurring verbs", xlab = "Freq")

stats <- subset(x, upos %in% c("ADJ")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", 
         main = "Most occurring adjectives", xlab = "Freq")


### Extract top keyword NOUN - ADJ / NOUN - VERB combinations
## Collocation (words following one another)
# Sequences of up to 4 tokens, grouped by document, paragraph and sentence
stats <- keywords_collocation(x = x, 
                              term = "token", group = c("doc_id", "paragraph_id", "sentence_id"),
                              ngram_max = 4)
# Sort by frequency, highest first, and chart the first 20
collocs <- stats[order(-stats$freq),]
collocs$keyword <- factor(collocs$keyword, levels = rev(collocs$keyword))
barchart(keyword ~ freq, data = head(collocs, 20), col = "cadetblue", 
         main = "Most occurring collocations", xlab = "Freq")

## Co-occurrences: How frequent do words occur in the same sentence, in this case only nouns or adjectives
stats <- cooccurrence(x = subset(x, upos %in% c("NOUN", "ADJ")), 
                      term = "lemma", group = c("doc_id", "paragraph_id", "sentence_id"))

# NOTE(review): this sorted result is overwritten below before it is used
co_occur <- stats[order(-stats$cooc),]


## Co-occurrences: How frequent do words follow one another
stats <- cooccurrence(x = x$lemma, 
                      relevant = x$upos %in% c("NOUN", "ADJ"))

co_occur <- data.frame(stats[order(-stats$cooc),])
library(tidyr)
# Join term1 and term2 into one "words" label for the chart; the original
# columns are kept (remove = FALSE)
co_occur <- co_occur %>% unite("words", term1:term2, sep = " ", remove = FALSE)
co_occur$words <- factor(co_occur$words, levels = rev(co_occur$words))
barchart(words ~ cooc, data = head(co_occur, 20), col = "cadetblue", 
         main = "Most occurring co-occurences", xlab = "Freq")


## Co-occurrences: How frequent do words follow one another even if we would skip 2 words in between
# `stats` is rebound once more; the value left here is the skipgram result
stats <- cooccurrence(x = x$lemma, 
                      relevant = x$upos %in% c("NOUN", "ADJ"), skipgram = 2)
head(stats)

## Visualizations
library(igraph)
library(ggraph)
library(ggplot2)
# Uses the first 70 rows of whatever `stats` currently holds (the skipgram
# co-occurrences when the statements are run in order)
wordnetwork <- head(stats, 70)
wordnetwork <- graph_from_data_frame(wordnetwork)
# Network plot: edge width and transparency follow the co-occurrence count,
# nodes are drawn as text labels
ggraph(wordnetwork, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "pink") +
  geom_node_text(aes(label = name), col = "darkgreen", size = 4) +
  theme_graph(base_family = "Arial Narrow") +
  theme(legend.position = "none") +
  labs(title = "Cooccurrences within 3 words distance", subtitle = "Nouns & Adjective")

## Using RAKE
# Keyword extraction on lemmas per document, restricted to nouns and adjectives
stats <- keywords_rake(x = x, term = "lemma", group = "doc_id", 
                       relevant = x$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# Chart the first 20 keywords that occur more than 3 times
barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "red", 
         main = "Keywords identified by RAKE", 
         xlab = "Rake")