## R for text analysis
#### Data Management (Spring/Summer 2018) at OSIPP, Osaka U

### Preamble

In [None]:
library(RMeCab) #install.packages("RMeCab", repos = "http://rmecab.jp/R")
library(data.table)
library(stringr)
library(ggplot2)
#library(MASS)
library(FactoMineR) #install.packages("FactoMineR",repos='http://cran.us.r-project.org')
library(factoextra) #install.packages("factoextra",repos='http://cran.us.r-project.org')

In [2]:
# Remove every object that ls() lists in the current environment so the
# analysis starts from an empty workspace.
# NOTE(review): this also deletes anything defined earlier in the same
# session; confirm the reset is really wanted before re-running this cell.
rm(list = ls())

In [19]:
#setwd("") # set the path to your local directory.

In [20]:
# Figure size (width and height) used when plots are rendered.
options(
  repr.plot.width = 7,
  repr.plot.height = 7
)

### Import data

In [None]:
# Read the speech CSV as UTF-8 into a plain data.frame (not a data.table),
# keeping strings as character, then show the first rows as a sanity check.
orig_data <- fread(
  "syuuin_speech_tpp2017.csv",
  sep = ",",
  encoding = "UTF-8",
  data.table = FALSE,
  stringsAsFactors = FALSE
)
print(head(orig_data))

### Split into speaker names and speeches
- Each meeting has at most 20 speeches, and the data contain 20 meetings.

In [None]:
# Split each cell of the first 26 columns at the first full-width space:
# the piece before it is the speaker name, the remainder is the speech.
# One row of `text` per meeting: the 26 first pieces, then the 26 second pieces.
text <- t(sapply(seq_len(20), function(meeting) {
  str_split_fixed(orig_data[meeting, 1:26], "　", 2)
}))

# Convert from UTF-8 to Shift_JIS.
# Positions 7:26 are the name pieces and 33:52 the speech pieces of the same
# source columns (7 to 26).
name <- data.frame(t(sapply(seq_len(20), function(meeting) {
  iconv(text[meeting, 7:26], from = "UTF-8", to = "Shift_JIS")
})))
speech <- data.frame(t(sapply(seq_len(20), function(meeting) {
  iconv(text[meeting, 33:52], from = "UTF-8", to = "Shift_JIS")
})))
# validEnc(as.character(speech[1, 1]))

print(dim(name))     # rows = meetings, columns = speakers
print(dim(speech))
print(name[20, 2])   # 20th meeting, 2nd speaker
print(speech[20, 2]) # 20th meeting, 2nd speaker

### Split speeches into words

In [None]:
# Run MeCab over each of the 20 columns of `speech`.
# Third argument: 0 returns words as they are spoken, 1 returns basic forms.
data <- sapply(seq_len(20), function(speaker_col) {
  RMeCabDF(speech, speaker_col, 0)
})
print(class(data[1, ]))
print(dim(data))

# Totals over every cell of `data`.
print(length(unlist(data)))         # words
print(length(unique(unlist(data)))) # unique words
print(data[20, 2])                  # 20th meeting, 2nd speaker

### Count words

In [None]:
# Number of tokens in each cell of `data` (20 meetings x 20 speakers).
# A cell containing any NA (no speech in that slot) is recorded as NA.
# vapply over seq_along() replaces the original loop, which indexed with
# 1:length(data) and grew the result from NULL; the result is always integer.
text_counts <- vapply(
  seq_along(data),
  function(i) {
    tokens <- data[[i]]
    if (any(is.na(tokens))) NA_integer_ else length(tokens)
  },
  integer(1)
)
# print(text_counts)
print(summary(text_counts))

### Plot histogram

In [None]:
# Histogram of the number of tokens per speech.
df_text_counts <- data.frame(text_counts)

# x must be mapped to the `text_counts` column; the original mapped it to the
# data frame object itself (aes(x = df_text_counts)), which is not a column
# of `data`.
ggplot(data = df_text_counts, aes(x = text_counts)) +
  geom_histogram(binwidth = 100, alpha = 0.3, fill = "black") +
  scale_y_continuous(breaks = seq(0, 1250, by = 200)) +
  labs(title = "Title", x = "", y = "") +
  theme(
    panel.background = element_rect(fill = NA),
    panel.border = element_rect(fill = NA, color = "grey75"),
    axis.ticks = element_line(color = "grey85"),
    # panel.grid.major = element_line(color = "grey95", size = 0.2),
    # panel.grid.minor = element_line(color = "grey95", size = 0.2),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 9),
    axis.title = element_text(size = 9),
    axis.text = element_text(size = 9)
  )

### Frequency table of PoS

In [None]:
# Flatten all tokens into one vector and tabulate the names attached to them.
# NOTE(review): the names are presumably the MeCab part-of-speech tags (the
# same labels compared against "名詞", "動詞", ... further down) — confirm.
table(names(unlist(data)))

### Separate by PoS

In [13]:
# Split the tokens of every cell of `data` by part of speech.
# pos[[k]][[i]] holds the tokens of cell i whose name equals the k-th tag in
# the vector below, so the tag order fixes the indices used by later cells
# (pos[[1]] = nouns, pos[[2]] = verbs, ...).
# One tag vector plus nested lapply() replaces thirteen copy-pasted
# assignments and the 1:length(data) loop; the resulting list is identical.
pos <- lapply(
  c(
    "名詞", "動詞", "形容詞", "感動詞", "副詞", "助動詞", "助詞",
    "接続詞", "接頭詞", "連体詞", "その他", "フィラー", "記号"
  ),
  function(tag) {
    lapply(seq_along(data), function(i) {
      tokens <- data[[i]]
      tokens[names(tokens) == tag]
    })
  }
)

In [None]:
# Noun totals; local() keeps the flattened vector out of the workspace.
local({
  all_nouns <- unlist(pos[[1]])
  print(length(all_nouns))         # count nouns
  print(length(unique(all_nouns))) # count unique nouns
  invisible(NULL)
})
# print(pos[[1]])

### Words used by a specific politician

In [None]:
# Print the token vectors of every cell whose speaker name is 河野国務大臣.
# which() on the data-frame comparison yields linear (column-major) indices,
# which are then used to index the `data` matrix of token lists.
# NOTE(review): assumes `name` and `data` have the same dimensions — confirm.
print(data[which(name == "河野国務大臣")])

#### - Most frequent words

In [None]:
# Frequency table of every token spoken by 河野国務大臣.
data_unlist <- table(unlist(data[which(name == "河野国務大臣")]))

# Show the 50 most frequent tokens. The slice is capped at the table length,
# because a fixed [1:50] pads the output with NA entries when fewer than 50
# distinct tokens exist.
print(
  data_unlist[order(data_unlist, decreasing = TRUE)][
    seq_len(min(50L, length(data_unlist)))
  ]
)

#### - Most frequent nouns

In [None]:
# Frequency table of the nouns (pos[[1]]) spoken by 河野国務大臣.
pol_word <- table(unlist(pos[[1]][which(name == "河野国務大臣")]))

# Most frequent nouns: at most 50. The slice is capped at the table length,
# because a fixed [1:50] pads the output with NA entries when fewer than 50
# distinct nouns exist.
print(
  pol_word[order(pol_word, decreasing = TRUE)][
    seq_len(min(50L, length(pol_word)))
  ]
)

#### - Remove some words from the list of nouns

In [None]:
# Nouns to exclude from the ranking.
remove_words <- c("こと", "の", "化", "それ")
pol_word2 <- pol_word[!names(pol_word) %in% remove_words]

# Most frequent remaining nouns: at most 50. The slice is capped at the table
# length, because a fixed [1:50] pads the output with NA entries when fewer
# than 50 nouns are left.
print(
  pol_word2[order(pol_word2, decreasing = TRUE)][
    seq_len(min(50L, length(pol_word2)))
  ]
)

### Correspondence Analysis

#### - Select all Daijin

In [None]:
# Distinct speaker names across all meetings.
speakers <- unique(unlist(name))
print(length(speakers))
print(speakers)

# Keep only the speakers whose name contains "大臣".
daijin <- speakers[grepl("大臣", speakers)]
print(daijin)

#### - Select nouns for each daijin

In [None]:
# select nouns
daijin_table <- sapply(1:8, function(y) table(unlist(pos[[1]][which(name == as.character(daijin[y]))])))
# select top 100 nouns
daijin_table_top <- lapply(1:8, function(y) daijin_table[[y]][order(daijin_table[[y]],decreasing=TRUE)][1:100])
print(daijin_table_top[[2]]) 

#### - Bind data

In [None]:
# Stack the eight per-minister top-noun tables into one long data frame with
# columns word / id / freq. The k-th label is attached to
# daijin_table_top[[k]].
daijin_speech_data <- local({
  minister_ids <- c("斉藤", "菅", "茂木", "松山", "世耕", "梶山", "河野", "石井")
  per_minister <- lapply(seq_along(minister_ids), function(k) {
    top_nouns <- daijin_table_top[[k]]
    data.frame(
      word = names(top_nouns),
      id = rep(minister_ids[k], length(top_nouns)),
      freq = as.data.frame(top_nouns)$Freq
    )
  })
  do.call(rbind, per_minister)
})

# Cross-tabulation: words in rows, ministers in columns, cells = frequency.
daijin_speech <- xtabs(freq ~ word + id, data = daijin_speech_data)
print(daijin_speech)
print(rownames(daijin_speech))

#### - Plot

In [None]:
# Correspondence analysis (FactoMineR) of the word-by-minister table.
daijin_ca <- CA(daijin_speech, graph = FALSE)

# Biplot (factoextra): rows (words) in blue, columns (ministers) in red.
# The stray ideographic space (U+3000) that sat in the code before the inline
# comment has been removed; it is not guaranteed to parse in every locale.
fviz_ca(
  daijin_ca,
  col.row = "steelblue",
  col.col = "red",
  alpha.row = 0.5,
  labelsize = 4,
  select.row = list(contrib = 100)
) +
  theme(
    panel.background = element_rect(fill = NA),
    panel.border = element_rect(fill = NA, color = "grey75"),
    axis.ticks = element_line(color = "grey85"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 9),
    axis.title = element_text(size = 9),
    axis.text = element_text(size = 9)
  )

#### Check Kono Daijin's top nouns

In [None]:
# Print the 7th top-noun table; the binding cell labels this entry "河野".
# NOTE(review): relies on the order of `daijin` matching those labels — confirm.
print(daijin_table_top[[7]])

In [None]:
# Print the raw speech text of every cell whose speaker name is 河野国務大臣.
# unlist(speech) flattens the data frame column by column, so its positions
# line up with the linear indices that which() returns for `name`.
# NOTE(review): assumes `name` and `speech` have identical dimensions — confirm.
print(unlist(speech)[which(name == "河野国務大臣")])

In [51]:
#windowsFonts()
#windowsFonts(gothic=windowsFont("MS Gothic")) 